Setup

library(tidyverse)
library(ape)
library(phangorn)
library(pegas)
library(adegenet)
#library(strataG)
library(knitr)
library(emoji)
library(coda)
library(ggmcmc)
library(perm)
library(Rmpfr)
library(spatstat.explore)
library(colorspace)
library(latex2exp)
library(ggridges)
library(raster)
library(reshape2)
library(snpR)
library(graph4lg)
source("../IBD_Kernels/IBD_functions.R")

# Function library
harvest.model.likelihoods <- function(workingDir = workingDir,
                                      outfileName = "outfile.txt",
                                      multilocus = TRUE){
    # Harvest model marginal likelihoods for models calculated by the
    # program migrate-n (Beerli & Felsenstein 2001).
    #
    # It takes as input a directory full of directories, each of which
    # contains output from a migrate model, and is named after that model.
    #
    # workingDir:  directory containing one subdirectory per model.
    # outfileName: name of the migrate-n output file in each model directory.
    # multilocus:  currently unused; kept for backward compatibility.
    #
    # Returns a data frame with one row per model and columns
    # model, thermodynamic, bezier.corrected, harmonic.

    # list.dirs(full.names = FALSE) includes "" for workingDir itself;
    # drop that first element to keep only the model subdirectories
    modelNames <- list.dirs(workingDir, full.names = FALSE)[-1]

    # build one single-row data frame per model, then bind once at the end
    # (avoids growing a data frame inside the loop)
    rows <- vector("list", length(modelNames))
    for (i in seq_along(modelNames)) {
        model <- modelNames[i]
        modelDir <- file.path(workingDir, model)
        print(modelDir)
        # scan in the outfile, separating at each newline
        outfile <- scan(file = file.path(modelDir, outfileName),
                        what = "character", sep = "\n", quiet = TRUE)
        # the marginal likelihoods sit on the line just above "Scaling factor"
        likeLine <- grep("Scaling factor", outfile) - 1
        if (length(likeLine) == 0) next  # outfile has no likelihood table
        # split on runs of whitespace; fields 3:5 are the three likelihoods
        fields <- strsplit(outfile[likeLine[1]], "\\s+", perl = TRUE)[[1]][3:5]
        rows[[i]] <- data.frame(model = model,
                                thermodynamic = as.numeric(fields[1]),
                                bezier.corrected = as.numeric(fields[2]),
                                harmonic = as.numeric(fields[3]),
                                stringsAsFactors = FALSE)
    }
    modelMarglikes <- do.call(rbind, rows)
    if (is.null(modelMarglikes)) {
        # no models harvested: return an empty frame with the expected columns
        modelMarglikes <- data.frame(model = character(),
                                     thermodynamic = numeric(),
                                     bezier.corrected = numeric(),
                                     harmonic = numeric())
    }
    modelMarglikes
}

bfcalcs<-function(df,ml="bezier.corrected"){
  # Compute log Bayes factors and model probabilities for a table of
  # marginal likelihoods (as produced by harvest.model.likelihoods()),
  # following Johnson and Omland (2004).
  #
  # df: data frame with one row per model and numeric likelihood columns.
  # ml: likelihood flavor to use -- "bezier.corrected", "thermodynamic"
  #     or "harmonic".
  #
  # Returns df with three extra columns: lbf (log Bayes factor relative
  # to the best model), choice (model rank, 1 = best) and modelprob
  # (model probability).
  likelihoods <- df[[ml]]
  best <- likelihoods[which.max(likelihoods)]
  # log Bayes factor: 0 for the best model, negative for all others
  lbf <- 2 * (likelihoods - best)
  # rank the models from best (1) to worst
  choice <- rank(-likelihoods)
  # normalize relative likelihoods into model probabilities
  relative <- exp(lbf / 2)
  modelprob <- relative / sum(relative)
  cbind(df, lbf, choice, modelprob)
}

migrants.per.gen<-function(x){
  # Append Nm (migrants per generation) columns to a data frame of
  # migrate-n parameter draws: for every column named M_<from>_<to>,
  # create Nm_<from>_<to> = M_<from>_<to> * Theta_<to>.
  #
  # x: data frame whose names include M_<from>_<to> and Theta_<to> columns.
  # Returns x with the new Nm_<from>_<to> columns appended.
  m_cols <- grep("M_", names(x), value = TRUE)  # names of the m columns
  for (m_col in m_cols) {
    parts <- strsplit(m_col, split = "_")[[1]]  # c("M", from, to)
    theta_col <- paste("Theta", parts[3], sep = "_")
    nm_col <- paste("Nm", parts[2], parts[3], sep = "_")
    # Nm_<from>_<to> is the M column times the receiving deme's Theta
    x[[nm_col]] <- x[[m_col]] * x[[theta_col]]
  }
  x
}

Introduction

Rob Toonen asked me to take a look at this dataset sequenced and analyzed by ’Ale’alani Dudoit.

From Rob:

It is a zoanthid - Palythoa tuberculosa - that is most common in anthropogenically disturbed habitats. It seems like it could be a cool story, and she has plenty of SNPs or contigs to do whatever analysis you think would be most interesting to work up.

’Ale’a:

Mahalo for checking out our data Eric, would be great to try out Migrate-n and see what results come of it. A couple other folks in the lab had similar interesting fst/structure results with their data (corals and fish) where O’ahu is more genetically similar to Kure/Midway atoll than to neighboring islands. Would be interesting to see if we get any cool results as Rob mentioned with military transport b/w O’ahu and Midway during WWII. I can send along my TotalRawSNPs and Filtered vcf files through scp command if you have a destination path you can provide me. Please let me know what other info you need from me, happy to send along.

Eric:

So it turns out that dDocent (or actually an associated perl script) can make haplotypes, but it will need the bam files for each individual. Supposedly we have unlimited storage on Google Drive at PSU, so I’m going to test that. I’ve shared a folder that you can use to upload the bam files.

Transfer files

So, now I’ve received all .bam and .bai files for each individual, as well as locality metadata and total and filtered SNP datasets from ’Ale’a on Google Drive. I compressed these on my mac, and now need to transfer them to Argonaute. I will do this with the gdown python package (pip install gdown). This involves looking up the google id for each file in the URL of the share link and pasting that into gdown thusly

gdown https://drive.google.com/uc?id=19nagmzHPQgKE4PPlyHikcWc7lBZFMbXB --output ptuberculosa_metadata.csv
gdown https://drive.google.com/uc?id=19g9fMRdPH-_AG-l6kytu-lv6kd3qYpQD --output popmap
gdown https://drive.google.com/uc?id=1ASB4yOnwDzDhFPA7Pev4CyIl6M6SX8_P --output popmap.csv
gdown https://drive.google.com/uc?id=19ZsobdvBIs3bQukW0Q-ivpdlJEdGguns --output reference.fasta
gdown https://drive.google.com/uc?id=19XNrJvZKMQ3L7qTIuExajc5KY6-gj3P8 --output mm0.9minq30mac3thin500.recode.vcf
gdown https://drive.google.com/uc?id=19vth-3IMkfMfxq4cp0QLOUgC2yFqnZ3C --output TotalRawSNPs.vcf
gdown https://drive.google.com/uc?id=1I0ib-QX4eyedctD951os6gvAMUe761JU --output bam_files.zip


gdown https://drive.google.com/uc?id=1ES3yMzZS8JkWROiE1jdfTp6KBjxDdCXO --folder --remaining-ok --output firstbatch

Isolation by Distance

I’m going to use the pairwise Fst values reported by ’Ale’a in a quick IBD plot. Here I’m importing the Fst and calculating the great-circle distances.

fst <- read_csv("figures/FST_table.csv")
## Rows: 9 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Pop
## dbl (9): Holaniku, Manawai, Kamakuokamohoali'i, Lalo, Kaua'i, O'ahu, Moloka'...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fst <- as.dist(fst[,2:10])
fst <- fst / (1-fst) #linearize it
islands <- read_csv("figures/hypotheses_as_graphs/hawaii_vertices.csv")
## Rows: 15 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): island, haole_name, sector, sampled, label
## dbl (3): vertex, lat, long
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
islands <- islands[rev(row.names(islands)),] %>% filter(sampled == "black")
gcdist <-  as.dist(pointDistance(islands[,8:7], lonlat=T)/1000)

attr(gcdist, "Labels") <- islands$label
attr(fst, "Labels") <- islands$label

#write.csv(as.matrix(fst), "figures/linearizedFst.csv", row.names = F, quote=F)
#write.csv(as.matrix(gcdist), "figures/gcdists.csv", row.names = F, quote=F)

Here’s the IBD plot

mantelt <- mantel.randtest(fst,gcdist, nrepet = 10000)

#trying to keep the population labels
# FIX: the fst column previously melted as.matrix(gcdist) (the geographic
# distance matrix) instead of as.matrix(fst), so both columns held the
# same distance values.
test <- tibble(distance=melt(as.matrix(gcdist), varnames = c("row", "col")),
                    fst=melt(as.matrix(fst)))
## Warning in melt(as.matrix(gcdist), varnames = c("row", "col")): The melt
## generic in data.table has been passed a matrix and will attempt to redirect to
## the relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend the
## namespace like reshape2::melt(as.matrix(gcdist)). In the next version, this
## warning will become an error.
## Warning in melt(as.matrix(gcdist)): The melt generic in data.table has been
## passed a matrix and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now deprecated
## as well. To continue using melt methods from reshape2 while both libraries are
## attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(as.matrix(gcdist)). In the next version, this warning will
## become an error.
distances <- tibble(distance=as.vector(gcdist),fst=as.vector(fst), label =as.vector(attr(fst, "Labels")))

lmodel <- lm(fst ~ distance , distances)

slope <- round(lmodel$coefficients[2],7)
mantelr <- round(mantelt$obs, 2)
pvalue <- round(mantelt$pvalue, 5)

lmodel_plot <- ggplot(distances,aes(x=distance,y=fst)) +
              geom_point() + 
                          geom_smooth(method=lm) + 
                          xlab("Geographic Distance (km)") + 
             ylab(expression(F["ST"]/1-F["ST"])) + 
             geom_text(label = paste("m =", slope, 
                          "; Mantel r =", mantelr,
                         ", p =", pvalue ), 
                  mapping = aes(x = 700, y = -0.002))

lmodel_plot
## `geom_smooth()` using formula = 'y ~ x'

But let’s remove Oahu because we know there has been contact between this population and the NWHI via the Navy.

fst <- as.matrix(fst)
fst <- fst[-4,-4] %>% as.dist()

gcdist <- as.matrix(gcdist)
gcdist <- gcdist[-4,-4] %>% as.dist()

mantelt <- mantel.randtest(fst,gcdist, nrepet = 10000)

distances <- tibble(distance=as.vector(gcdist),fst=as.vector(fst))

lmodel <- lm(fst ~ distance , distances)

slope <- round(lmodel$coefficients[2],7)
mantelr <- round(mantelt$obs, 2)
pvalue <- round(mantelt$pvalue, 5)

lmodel_plot <- ggplot(distances,aes(x=distance,y=fst)) +
              geom_point() + 
                          geom_smooth(method=lm) + 
                          xlab("Geographic Distance (km)") + 
             ylab(expression(F["ST"]/1-F["ST"])) + 
             geom_text(label = paste("m =", slope, 
                          "; Mantel r =", mantelr,
                         ", p =", pvalue ), 
                  mapping = aes(x = 700, y = -0.002))

lmodel_plot
## `geom_smooth()` using formula = 'y ~ x'

Haplotypes from dDocent

Filter SNPs

‘Ale’ already filtered the SNPs quite a bit, but I need to start that process over because coalescent programs are sensitive to ascertainment bias. Also, need to knit them back into haplotypes. As Peter Beerli says on page 15 of the migrate-n manual:

We use a rather restrictive ascertainment models for SNPs ?. A better approach than using SNPs is the use of short reads which may or many not contain SNPs. I find that SNPs are an inferior datatype because commonly researchers are adding criteria such as a minor SNP allele must occur at a frequency higher than x, and singletons are excluded etc.

We have found ALL variable sites and use them even if there are only a few members of another alleles present. In principal it is as you would sequence a stretch of DNA and then remove the invariant sites. Each stretch is treated as completely linked. You can combine many of such “loci” to improve your estimates.

I’ve installed dDocent into a dedicated environment following instructions here.

conda activate ddocent_env

Following the tutorial, but altering filtering parameters. Keeping all SNPs with even a single occurrence (Minor Allele Count = 1), but they must occur in a minimum fraction of individuals (max-missing) and be of high quality (minQ 30). (NOTE(review): the original text said "90% of individuals", but the command below passes `--max-missing 0.1`, which in vcftools requires calls in only 10% of individuals — confirm which threshold was intended.) We are keeping genotypes with as few as 3 reads. This is because evidence to keep these SNPs is based on their existence in multiple individuals.

vcftools --vcf ../TotalRawSNPs.vcf --max-missing 0.1 --mac 1 --minQ 30 --minDP 3 --recode --recode-INFO-all --out rawg.1mac1dp3

After filtering, kept 93 out of 93 Individuals Outputting VCF file… After filtering, kept 28128 out of a possible 52804 Sites Run Time = 9.00 seconds

Run this error checking script to see the potential impact of shallow-sequenced genotypes

ErrorCount.sh rawg.1mac1dp3.recode.vcf

<!-- This script counts the number of potential genotyping errors due to low read depth -->
<!-- It report a low range, based on a 50% binomial probability of observing the second allele in a heterozygote and a high range based on a 25% probability. -->
<!-- Potential genotyping errors from genotypes from only 1 read range from 0.0 to 0.0 -->
<!-- Potential genotyping errors from genotypes from only 2 reads range from 0.0 to 0.0 -->
<!-- Potential genotyping errors from genotypes from only 3 reads range from 3808.875 to 12797.82 -->
<!-- Potential genotyping errors from genotypes from only 4 reads range from 3860.25 to 19517.424 -->
<!-- Potential genotyping errors from genotypes from only 5 reads range from 913.78125 to 6930 -->
<!-- 93 number of individuals and 28128 equals 2615904 total genotypes -->
<!-- Total genotypes not counting missing data 2508832 -->
<!-- Total potential error rate is between 0.0034210765208670807 and 0.015642834593946504 -->
<!-- SCORCHED EARTH SCENARIO -->
<!-- WHAT IF ALL LOW DEPTH HOMOZYGOTE GENOTYPES ARE ERRORS????? -->
<!-- The total SCORCHED EARTH error rate is 0.04841934414101861. -->

Look at missing data per individual

vcftools --vcf rawg.1mac1dp3.recode.vcf --missing-indv

mawk '!/IN/' out.imiss | cut -f5 > totalmissing
gnuplot << \EOF 
set terminal dumb size 120, 30
set autoscale 
unset label
set title "Histogram of % missing data per individual"
set ylabel "Number of Occurrences"
set xlabel "% of missing data"
#set yr [0:100000]
binwidth=0.01
bin(x,width)=width*floor(x/width) + binwidth/2.0
plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
                                                                                                                        
                                         Histogram of % missing data per individual                                     
     20 +-----------------------------------------------------------------------------------------------------------+   
        |    * *      +            +             +            +             +            +             +            |   
        |    * **                                             'totalmissing' using (bin($1,binwidth)):(1.0) ******* |   
        |    * **                                                                                                   |   
        |    * **                                                                                                   |   
        |    * **                                                                                                   |   
     15 |-+  * **                                                                                                 +-|   
        |    * **                                                                                                   |   
        |   ** **                                                                                                   |   
        |   ** **                                                                                                   |   
        |   ** **                                                                                                   |   
     10 |-+ ** **                                                                                                 +-|   
        |   ** **                                                                                                   |   
        |   ** ***                                                                                                  |   
        |   ** ***                                                                                                  |   
        |   ** ***                                                                                                  |   
        |   ** *****                                                                                                |   
      5 |-+ ** *** *                                                                                              +-|   
        |  *** *** ****                                                                                             |   
        |  *** *** ** *                                                                                             |   
        |  *** *** ** *                                                                                             |   
        |  *** *** ** *******                                                                                       |   
        |***** *** ** *** * *******************************************************************************         |   
      0 +-----------------------------------------------------------------------------------------------------------+   
        0            0.1          0.2           0.3          0.4           0.5          0.6           0.7          0.8  
                                                      % of missing data                                                 

‘Ale’ has probably removed any really bad individuals already. I’ll keep all of these for now.

Now filter based on missingness within populations. Puritz wrote a script for this that creates lots of output, so I created ./filterpop directory to run it in. I am removing loci that are missing 10% of data in any of 10 populations. I altered popmap to remove spaces from pop names too.

pop_missing_filter.sh ../rawg.1mac1dp3.recode.vcf ../../popmap 0.1 10 rawg1mac1dp3allpops.1.vcf

This filter keeps loci with Allele Balance between 0.25 and 0.75. From Jon:

Allele balance is: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous Because RADseq targets specific locations of the genome, we expect that the allele balance in our data (for real loci) should be close to 0.5

So this will keep only really high quality SNPs

 vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" ./filterpops/rawg1mac1dp3allpops.1.vcf.recode.vcf > rawg.1mac1dp3allpop.1.ABfil.vcf
 
 mawk '!/#/' *ABfil.vcf | wc -l
2756

Now remove SNPs that occur on both strands of a read. Jon:

Unless you are using super small genomic fragment or really long reads (MiSeq). A SNP should be covered only by forward or only reverse reads. The filter is based on proportions, so that a few extraneous reads won’t remove an entire locus…. In plain english, it’s keeping loci that have over 100 times more forward alternate reads than reverse alternate reads and 100 times more forward reference reads than reverse reference reads along with the reciprocal….That only removes a small proportion of loci, but these loci are likely to be paralogs, microbe contamination, or weird PCR chimeras.

```{bash, eval = F}
vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s rawg.1mac1dp3allpop.1.ABfil.vcf > rawg.1mac1dp3allpop.1.ABfil.FRfil.vcf

mawk '!/#/' rawg.1mac1dp3allpop.1.ABfil.FRfil.vcf | wc -l
7
```


Hmmm... that's a bit too stringent... maybe these were done on a MiSeq?


> The next filter looks at the ratio of mapping qualities between reference and alternate alleles...The rationale here is that, again, because RADseq loci and alleles all should start from the same genomic location there should not be large discrepancy between the mapping qualities of two alleles.


```bash
vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" rawg.1mac1dp3allpop.1.ABfil.vcf > rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf
mawk '!/#/' rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf | wc -l
2512

Yet another filter that can be applied is whether or not there is a discrepancy in the properly paired status for reads supporting reference or alternate alleles….Since de novo assembly is not perfect, some loci will only have unpaired reads mapping to them. This is not a problem. The problem occurs when all the reads supporting the reference allele are paired but not supporting the alternate allele. That is indicative of a problem.

vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf > rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf

mawk '!/#/' rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf| wc -l
2141
dDocent_filters rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf 
This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth,
quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation.
The script assumes that loci and individuals with low call rates (or depth) have already been removed. 

Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters 

Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth
 1192 of 2141 

Are reads expected to overlap?  In other words, is fragment size less than 2X the read length?  Enter yes or no.
yes
Is this from a mixture of SE and PE libraries? Enter yes or no.
no
Number of additional sites filtered based on properly paired status
 0 of 949 

Number of sites filtered based on high depth and lower than 2*DEPTH quality score
 78 of 949 

                                                                                                                        
                                                                                                                        
                                               Histogram of mean depth per site                                         
       30 +---------------------------------------------------------------------------------------------------------+   
          |    +    +   **+    +    +    +     +    +    +    +     +    +    +    +     +    +    +    +     +    +|   
          |             **                                'meandepthpersite' using (bin($1,binwidth)):(1.0) ******* |   
          |         *** **                                                                                          |   
       25 |-+       * * **                                                                                        +-|   
          |        ** ****    **                                                                                    |   
          |        ** ****    **                                                                                    |   
          |        ** ****    **                                                                                    |   
       20 |-+   ** ** ****    **                                                                                  +-|   
          | **  ** ** ****    **                                                                                    |   
          | **  ** ** *****   ** **                                                                                 |   
       15 |***  ** ** *****   ** **                                                                               +-|   
          |**** ** ** *****   ** **                                                                                 |   
          |**** ** ** ******* ** **                                                                                 |   
          |**** ***** ******* ** **              **                                                                 |   
       10 |********** *************   ***        **                                                               +-|   
          |********** ************** ****     ** **                                                                 |   
          |********** ************** ****     ** **                                                                 |   
          |********** ********************** *** ***          *****                                                 |   
        5 |********** ******************** * ************     * ***                                               +-|   
          |********** ******************** **************  ** * *******  **                                         |   
          |********** ******************** ******************** ****************** ************ *** ***  ***       *|   
          |********** ******************** ******************** ************ *** *** ****** * *** *** **** *********|   
        0 +---------------------------------------------------------------------------------------------------------+   
          10   15   20    25   30   35   40    45   50   55   60    65   70   75   80    85   90   95  100   105  110   
                                                          Mean Depth                                                    
                                                                                                                        
If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be 10141.7555
The 95% cutoff would be 101
Would you like to use a different maximum mean depth cutoff than 101, yes or no
no
Number of sites filtered based on maximum mean depth
 81 of 949 

Number of sites filtered based on within locus depth mismatch
 36 of 868 

Total number of sites filtered
 1309 of 2141 

Remaining sites
 832 

Filtered VCF file is called rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf.FIL.recode.vcf

Filter stats stored in rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf.filterstats
(ddocent_env) ecrandall@Argonaute:~/eric_data/ptuberculosa/filtering$ 

This script apparently does many of the other steps that I did above manually, so I started from an earlier checkpoint

dDocent_filters  rawg1mac1dp3allpops.1.vcf.recode.vcf  rawg.1mac1dp3allpop.1.dDfil.vcf 
This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth,
quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation.
The script assumes that loci and individuals with low call rates (or depth) have already been removed. 

Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters 

Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth
 23877 of 25261 

Are reads expected to overlap?  In other words, is fragment size less than 2X the read length?  Enter yes or no.
yes
Is this from a mixture of SE and PE libraries? Enter yes or no.
no
Number of additional sites filtered based on properly paired status
 197 of 1384 

Number of sites filtered based on high depth and lower than 2*DEPTH quality score
 1673 of 1187 

                                                                                                                        
                                                                                                                        
                                               Histogram of mean depth per site                                         
       60 +---------------------------------------------------------------------------------------------------------+   
          | +    +     +    +     +    +    +     +    +     +    +     +    +    +     +    +     +    +     +    +|   
          |                                               'meandepthpersite' using (bin($1,binwidth)):(1.0) ******* |   
          |           **                                                                                            |   
       50 |-+         **                                                                                          +-|   
          |           **                                                                                            |   
          |        ** **                                                                                            |   
          |        ** **   **                                                                                       |   
       40 |-+      ** **   **                                                                                     +-|   
          |    **  ** **   **                                                                                       |   
          |    ** *** ***  ** **                                                                                    |   
       30 |-+  *********** *****                                                                                  +-|   
          | ** *********** *****                                                                                    |   
          | ** *****************                                                                                    |   
          | ********************                                                                                    |   
       20 |-********************  **                                                                              +-|   
          |********************** ***                                                                               |   
          |********************** ***                                                                               |   
          |****************************     *                                                                       |   
       10 |**************************** *** *                                                                     +-|   
          |******************************** **   **    ***** **                                                     |   
          |**************************************************** **       **         ***                             |   
          |************************************************************************** ****  **  ** +  *** **  +*****|   
        0 +---------------------------------------------------------------------------------------------------------+   
            12   18    24   30    36   42   48    54   60    66   72    78   84   90    96  102   108  114   120  126   
                                                          Mean Depth                                                    
                                                                                                                        
If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be 11589.30825
The 95% cutoff would be 116
Would you like to use a different maximum mean depth cutoff than 116, yes or no
no
Number of sites filtered based on maximum mean depth
 109 of 1187 

Number of sites filtered based on within locus depth mismatch
 21 of 1072 

Total number of sites filtered
 24210 of 25261 

Remaining sites
 1051 

Filtered VCF file is called rawg.1mac1dp3allpop.1.dDfil.vcf.FIL.recode.vcf

Filter stats stored in rawg.1mac1dp3allpop.1.dDfil.vcf.filterstats

mv rawg.1mac1dp3allpop.1.dDfil.vcf.FIL.recode.vcf rawg.1mac1dp3allpop.1.dDfil.vcf

Now to filter by HWE

The next filter to apply is HWE. Heng Li also found that HWE is another excellent filter to remove erroneous variant calls. We don’t want to apply it across the board, since population structure will create departures from HWE as well. We need to apply this by population. I’ve included a perl script written by Chris Hollenbeck, one of the PhD student’s in my current lab that will do this for us.

Let’s filter our SNPs by population specific HWE First, we need to convert our variant calls to SNPs To do this we will use another command from vcflib called vcfallelicprimitives

vcfallelicprimitives rawg.1mac1dp3allpop.1.dDfil.vcf --keep-info --keep-geno > rawg.1mac1dp3allpop.1.dDfil.prim.vcf

This will decompose complex variant calls into phased SNP and INDEL genotypes and keep the INFO flags for loci and genotypes. Next, we can feed this VCF file into VCFtools to remove indels.

vcftools --vcf rawg.1mac1dp3allpop.1.dDfil.prim.vcf --remove-indels --recode --recode-INFO-all --out SNP.rawg.1mac1dp3allpop.1.dDfil

Outputting VCF file… After filtering, kept 1096 out of a possible 1126 Sites Run Time = 0.00 seconds

Now apply the HWE filter, in filterhwe folder

filter_hwe_by_pop.pl -v ../SNP.rawg.1mac1dp3allpop.1.dDfil.recode.vcf -p ../../popmap -o SNP.rawg.1mac1dp3allpop.1.dDfil.HWE -h 0.01 -c 0.1

<!-- Processing population: BigIsland (17 inds) -->
<!-- Processing population: FrenchFrigateShoals (9 inds) -->
<!-- Processing population: Kauai (15 inds) -->
<!-- Processing population: Kure (10 inds) -->
<!-- Processing population: MaroReef (5 inds) -->
<!-- Processing population: Maui (10 inds) -->
<!-- Processing population: Molokai (10 inds) -->
<!-- Processing population: Oahu (10 inds) -->
<!-- Processing population: Pearl_Hermes (4 inds) -->
<!-- Processing population: PioneerBanks (3 inds) -->
<!-- Outputting results of HWE test for filtered loci to 'filtered.hwe' -->
<!-- Kept 997 of a possible 1096 loci (filtered 99 loci) -->

Hmm… the HWE filter is not working great, likely because of low sample sizes. It’s leaving me with mostly loci without individuals with alternate homozygotes

conda deactivate

Haplotyping

rad_haplotyper.pl

Chris Hollenbeck has created a script that can get haplotypes from dDocent output. First, gotta remove the -RG from the .bam and .bai files.

for f in *.bam; do mv "$f" "${f%-RG.bam}.bam" ; done
for f in *.bam.bai; do  mv "$f" "${f%-RG.bam.bai}.bam.bai" ; done

Now, to install the rad_haplotyper environment following these instructions

hmmm … rad_haplotyper isn’t working because the Vcf Perl module changed its name to VCF… breaking the script. I tried a few minor modifications but couldn’t fix it. Created an issue in the github site.

Remember to remove the environment: $ conda remove rad_haplotyper_env

Microhaplot

I am now going to try microhaplot by Thomas Ng and Eric Anderson!!

Now, have to convert the bam files to sam files.

# Convert every BAM to SAM for microhaplot (which reads SAM input).
for bam in *.bam
do
echo $bam
samtools view -h -o ../sam_files/${bam%.bam}.sam $bam
done

# Strip the "-RG" suffix dDocent appended to the BAM file names.
for bam in *.bam
do
echo $bam
mv $bam ${bam%-RG.bam}.bam
done

# Strip the "-RG" suffix from the matching index (.bai) files.
for bam in *.bai
do
echo $bam
mv $bam ${bam%-RG.bam.bai}.bam.bai
done

# Record per-contig mapping statistics for each BAM.
for bam in *.bam
do
echo $bam
samtools idxstats $bam > ./idxstats/${bam%.bam}_idxstats.txt
done


# BAM -> SAM conversion again, this time with an absolute output path.
for bam in *.bam
do
echo $bam
samtools view -h -o /Users/edc5240/Datasets/Ptuberculosa/sam_files/${bam%.bam}.sam $bam
done

Tutorial

Do the tutorial

# Install the bundled microhaplot Shiny app into a local directory and
# launch it (tutorial run with the package's example data).
library(microhaplot)
shiny_dir <- "/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/Ptuberculosa_data/shiny_dir"
microhaplot::mvShinyHaplot(shiny_dir)

# The app itself lives in a "microhaplot" subdirectory of shiny_dir.
app.path <- file.path(shiny_dir, "microhaplot")
microhaplot::runShinyHaplot(app.path)

Load Data

Then follow along here to get data into microhaplot.

# Extract haplotypes from the P. tuberculosa SAM files with microhaplot,
# following the package vignette.
library(microhaplot)

# Label identifying this extraction run inside the app.
run.label <- "Ptuberculosa"

path <- "/Users/edc5240/Datasets/Ptuberculosa"
sam.path <- "/Users/edc5240/Datasets/Ptuberculosa/sam_files"
# untar(system.file("extdata",
#                   "sebastes_sam.tar.gz",
#                   package="microhaplot"),
#       exdir = sam.path)


# Sample-label file and the VCF defining the target SNP positions.
label.path <- file.path(path, "labels.txt")
vcf.path <- file.path(path, "mm0.9minq30mac3thin500.recode.vcf")

# Install the Shiny app next to the data, then point at it.
mvShinyHaplot(file.path(path,"shiny_dir"))
app.path <- file.path(path,"shiny_dir", "microhaplot")

# Tabulate haplotype reads from each SAM at the VCF sites.
haplo.read.tbl <- prepHaplotFiles(run.label = run.label,
                            sam.path = sam.path,
                            out.path = path,
                            label.path = label.path,
                            vcf.path = vcf.path,
                            app.path = app.path)

# Launch the app to curate the haplotype calls interactively.
runShinyHaplot(path = "/Users/edc5240/Datasets/Ptuberculosa/shiny_dir/microhaplot")

Working with the data in microhaplot was OK, but I kept getting unexplained N’s in the output, and it was going to take some coding effort to translate the output into migrate format. Moreover, microhaplot only kept the variable sites, which isn’t ideal for migrate’s mutation model. So I am giving up on this front, and am going to try to generate haplotypes with Stacks.

Haplotypes from Stacks

Setup

Need first to copy all the fastq files over from Google Drive, where ’Ale’a put them. I split them into 4 tranches to keep the number of files under 50 as required by gdown. Note that the folder protocol won’t take urls, but just the ID.

gdown --id 1ES3yMzZS8JkWROiE1jdfTp6KBjxDdCXO --folder --remaining-ok --output firstbatch
gdown --id 1Ofta4RLcf6vddIiAA6dLCkdrYngADW1d --folder --remaining-ok --output secondbatch
gdown --id 1Og6JQNlhgprYvLTxJtMBlCCj72qRxLOw --folder --remaining-ok --output thirdbatch
gdown --id 1Ojnlj3L_8UFoPUVxaqvLKTLs8qq_S9ml --folder --remaining-ok --output fourthbatch

Need to rename the files into Illumina format in order for Stacks to recognize the paired reads

cd raw

# Rename forward reads from dDocent naming (_1.F.fq.gz) to Illumina
# naming (_R1_001.fastq.gz) so Stacks can pair the reads.
for f in *.F.fq.gz
do
mv $f ${f/_1.F.fq.gz/_R1_001.fastq.gz}
done

# Same for the reverse reads (_1.R.fq.gz -> _R2_001.fastq.gz).
for f in *.R.fq.gz
do
mv $f ${f/_1.R.fq.gz/_R2_001.fastq.gz}
done

cd ..

Process RadTags

’Ale’a said:

It was an ezRAD protocol developed by the Tobo lab. The restriction enzymes used were both MboI and Sau3AI. The sequences have indexes removed but still need to be trimmed/ends cleaned.

So I will process the radtags to remove adapters and the sau3AI cut site (GATC, which is the same as MboI). This command cleans data removing reads with uncalled bases, low phred scores, rescue barcodes and rad-tags.


process_radtags -p ./raw/ -o ./cleaned/ --rescue --clean --quality --paired --filter_illumina --renz-1 sau3AI \
--adapter-1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \
--adapter-2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT

724964572 total sequences 0 failed Illumina filtered reads (0.0%) 227835784 reads contained adapter sequence (31.4%) 0 barcode not found drops (0.0%) 28941251 low quality read drops (4.0%) 63374437 RAD cutsite not found drops (8.7%) 404813100 retained reads (55.8%)

Pipeline

Hmm… gotta rename the cleaned files too

# Strip the Illumina-style _R[12]_001 token that process_radtags kept in
# the cleaned file names, leaving SAMPLE.1.fq.gz / SAMPLE.2.fq.gz.
cd cleaned
for f in *001.1.fq.gz
do
mv $f ${f/_R[12]_001.1.fq.gz/.1.fq.gz}
done

# Same for the second-of-pair files.
for f in *001.2.fq.gz
do
mv $f ${f/_R[12]_001.2.fq.gz/.2.fq.gz}
done

I’m going to use the denovo_map.pl pipeline. This command will take data from ./cleaned and the popmap I created for dDocent and -m 3 reads per stack, -n 4 distance between stacks, -M 4 distance between catalog loci. Running on 12 -T threads and only keeping loci that appear in 75% of individuals in all 10 populations

denovo_map.pl --samples ./cleaned/ --popmap ../popmap --out-path ./pipeline --paired \
-m 3 -n 4 -M 4 -T 12 -r 75 -p 10 -X "populations: --fasta-samples" -X "populations: --filter-haplotype-wise"

Copy it down

scp -r  ecrandall@128.118.123.64:eric_data/ptuberculosa/stacks/pipeline/output1 ./stacks_output              

Populations

Now I need to re-run populations to get more statistics. Original populations output is in output1, this will be in output2.

populations -P ../ -O ./ -M ../../../popmap -t 12 -p 10 -r 75 -H --hwe --fstats --p-value-cutoff 0.99 --fasta-loci --fasta-samples --vcf --genepop --structure --treemix --fasta-samples-raw 

Fasta2Genotype

Paul Maier created this script to convert Stacks haplotypes to migrate format. I had to remove all the [individualName] tokens from the populations.samples.fa to get it to work.

python2 /Applications/migrate/migrate-4.4.4/fasta2genotype/fasta2genotype.py populations.samples2.fa NA ../popmap.tsv  populations.snps.vcf ptuberculosa.mig
###################################################################
###                                                             ###
###       Fasta2Genotype | Data Conversion | Version 1.10       ###
###                                                             ###
###                        Cite as follows:                     ###
###                                                             ###
###   Maier P.A., Vandergast A.G., Ostoja S.M., Aguilar A.,     ###
###   Bohonak A.J. (2019). Pleistocene glacial cycles drove     ###
###   lineage diversification and fusion in the Yosemite toad   ###
###   (Anaxyrus canorus). Evolution, in press.                  ###
###   https://www.doi.org/10.1111/evo.13868                     ###
###                                                             ###
###################################################################

Output type? [1] Migrate [2] Arlequin [3] DIYABC [4] LFMM [5] Phylip [6] G-Phocs [7] Treemix [8] Haplotype: 1
Remove restriction enzyme or adapter sequences? These may bias data. [1] Yes [2] No: 2
Coverage Cutoff (number reads for locus)? Use '0' to ignore coverage: 0
Remove monomorphic loci? [1] Yes [2] No: 2
Remove loci with excess heterozygosity? This can remove paralogs. [1] Yes [2] No: 1
Maximum heterozygosity cutoff for removing loci out of Hardy-Weinberg? 0.5
Filter for allele frequency? False alleles might bias data. [1] Yes [2] No: 2
Filter for missing genotypes? These might bias data. [1] Yes [2] No: 2
 
**************************************************************************************************************
***                                       ... BEGINNING CONVERSION ...                                     ***
**************************************************************************************************************
 
Cataloging loci...
Counting locus lengths...
Cataloging populations...
Counting gene copies...
Counting alleles for each locus...
Identifying loci with excess heterozygosity...
     Calculating observed heterozygosity and homozygosity...
     Calculating expected heterozygosity and homozygosity...
     Flagging loci with excess heterozygosity for removal...
     Removing loci...
     Removed 2 overly heterozygous loci.
Outputting migrate-n file...
*** DONE! ***

Then I manually re-ordered the populations to run from north to south in the output file, and moved it into the migrate folder.

Migrate

Install Migrate

Install the parallel version of Migrate on the Nautilus server. I had to remove -fvectorize from line 100 of the makefile to get this to work.

curl https://peterbeerli.com/migrate-html5/download_version4/migrate-newest-src.tar.gz > migrate.tar.gz
tar -xvf migrate.tar.gz
cd migrate-4.4.4/
cd src
./configure
make mpis (for parallel running)
sudo cp migrate-n-mpi /usr/local/bin/migrate-n-mpi

Locus Statistics & Mutation Model

These RAD loci are much less variable than COI that I’m used to using. I need to figure out an overall mutation model to use with RAD loci. I can’t find any discussion of this issue on the Migrate Google Group, so I created one. I guess I’ll use modelTest in the phangorn package to see where that gets me.

I renamed the FASTA headers in populations.samples.fa with BBEdit:

Find: >CLocus_\d+_Sample_\d+_Locus_(\d+)_Allele_([01]) \[(.+)\]
Replace: >\3_L\1_A\2

Let’s load in the data and calculate some statistics for each locus. Previously Migrate-n only implemented the F84 (=HKY) model, with two rates (transitions and transversions) and gamma-distributed rate variability. The new v4 has a datamodel parameter that suggests it might take other models of molecular evolution, but the possible models are not listed in the documentation! So I will just fit an HKY model.

note the interface lists these possible models: 1 Jukes-Cantor model 2 Kimura 2-parameter model 3 Felsenstein 1981 (F81) model 4 Felsenstein 1984 (F84) model 5 Hasegawa-Kishino-Yano model 6 Tamura-Nei model

# Read all Stacks haplotype sequences (two allele sequences per
# individual per locus, renamed to POP_Llocus_Aallele headers in BBEdit).
fastadata <- read.FASTA("/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/allpops75_hwe/populations.samples_rename.fasta")

# have to read in the locus lengths, because this is the only way to make sure the stats
# and the migrate infile are ordered the same
# (line 2 of the migrate infile is a tab-separated list of locus lengths,
# hence skip = 1, n_max = 1; lengths come in as character strings)
migrate_lengths <- read_lines("/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/ptuberculosa.mig", skip = 1, n_max=1) %>% 
  str_split("\\t", simplify = T) %>%  t(.) %>% tibble(length=.)

# Get locus names. fasta2genotype orders them alphabetically, so do this here too
locus_names <- str_extract(names(fastadata),"L\\d+") %>% unique() %>% sort()

# One row per locus: sequence summaries (length, segregating sites,
# haplotype count, nucleotide diversity, Ti/Tv ratio) plus the best-fit
# substitution model and its parameters (gamma shape, per-category rates,
# unique Q-matrix entries). Filled in by the loop below.
stats <- tibble(locus = character(), 
                length = numeric(),
                segSites = numeric(),
                nHaps = numeric(),
                nucDiv = numeric(),
                ttRatio = numeric(),
                model = character(),
                gammaShape = numeric(),
                rate1 = numeric(), 
                rate2 = numeric(), 
                rate3 = numeric(),
                rate4 = numeric(),
                Q1= numeric(),
                Q2 = numeric(),
                Q3 = numeric(),
                )
                
# Fit candidate substitution models and compute summary statistics for
# each locus, appending one row per locus to `stats`.
for(l in locus_names){
  print(paste("Now Doing Locus", l, match(l, locus_names), "out of", length(locus_names)))
  # all sequences (both alleles, all individuals) belonging to this locus
  locus_dnabin <- fastadata[str_which(names(fastadata),pattern = l)]
  # convert to package formats
  locus_dnabin <- as.matrix(locus_dnabin)
  # NOTE(review): sequence2gtypes()/TiTvRatio() are strataG functions, but
  # library(strataG) is commented out in the setup chunk -- presumably
  # attached interactively; confirm before re-running.
  locus_gtypes <- sequence2gtypes(locus_dnabin)
  locus_phy <- phyDat(locus_dnabin)
  #create a haplotype network .. to be continued
  haps <- haplotype(locus_dnabin)
  nhaps <- length(dimnames(haps)[[1]])
  #tcs <- haploNet(haps)
  #find parameters of HKY (F84) model
  # candidate set restricted to the models migrate-n v4's menu lists
  modeltest <- modelTest(locus_phy, model = c("JC","K80", "F81", "HKY","TrN"), 
                         G = T, I = F, k = 4, mc.cores = 4)
  # pick out the best model. If multiple models are tied for best, pick the simplest one
  bestmodel <- modeltest$Model[which(modeltest$BIC == min(modeltest$BIC))][1]
  #open the object environment
  # (phangorn stores each fitted model as a promise in an attached env)
  env <- attr(modeltest,"env")
  bestparams <- get(bestmodel, env)
  bestparams <- eval(bestparams, env=env)
  # use this code for v3, which only has F84 (HKY)
  #HKY <- modelTest(locus_phy, model = c("HKY"), 
  #                       G = T, I = F, k = 4)
  #env <- attr(HKY, "env")
  #HKYG <- get("HKY+G", env)
  #model <- eval(HKYG, env=env)
  # calculate TiTv Ratio
  ttratio <- TiTvRatio(locus_gtypes)
  
  # append this locus's row; ttratio[3] is taken as the overall ratio --
  # TODO confirm the element order TiTvRatio() returns
  stats <- bind_rows(stats, tibble(locus=l, 
                          length = length(locus_dnabin[1,]),
                          segSites = length(seg.sites(locus_dnabin)),
                          nHaps = length(dimnames(haps)[[1]]),
                          nucDiv = nuc.div(locus_dnabin),
                          ttRatio =  ttratio[3],
                          model = bestmodel,
                          gammaShape = bestparams$shape,
                          rate1 = bestparams$g[1],
                          rate2 = bestparams$g[2],
                          rate3 = bestparams$g[3],
                          rate4 = bestparams$g[4],
                          Q1 = sort(unique(bestparams$Q))[1],
                          Q2 = sort(unique(bestparams$Q))[2],
                          Q3 = sort(unique(bestparams$Q))[3]
                          ))
                         
                        
}

# Keep only loci whose length matches a length in the migrate infile,
# dropping loci that fasta2genotype removed (e.g. excess heterozygosity).
# NOTE(review): this matches on length alone, so a dropped and a kept
# locus with identical lengths could be confused -- verify with setdiff().
stats <- stats[which(stats$length %in% migrate_lengths$length),]
#setdiff(stats$length, as.numeric(migrate_lengths$length))
# write_csv(stats, "./migrate/run4_locus_statistics.csv")

Write a model block

Write a space delimited textfile that can be added to the migrate data file in the format that Peter suggested.

# Re-load the per-locus statistics computed above (run 4).
stats <- read_csv("./migrate/run4/run4_locus_statistics.csv")
## Rows: 109 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, model
## dbl (9): length, segSites, nHaps, nucDiv, ttRatio, gammaShape, rate1, Q1, Q2
## lgl (4): rate2, rate3, rate4, Q3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# write a space delimited textfile that can be added to the migrate data file
# following Peter's suggestion here:
#https://groups.google.com/g/migrate-support/c/XjV_4jZW4RI/m/HbRWoGY6AwAJ
# One "#$" line per locus: locus number, data type ("s" = sequence),
# best-fit model name, and two rate columns.
# NOTE(review): q1/q2 are filled from stats$Q2/Q3 (Q3 is NA for two-rate
# models) -- confirm this matches migrate's expected model-block order.
migrate_mutation_models <- tibble(prefix = "#$",
                                 locus = seq_along(stats$locus), # safe for zero rows, unlike 1:length()
                                 type = "s",
                                 model = stats$model,
                                 q1 = stats$Q2,
                                 q2 = stats$Q3)

#write_delim(migrate_mutation_models,"./migrate/run4_modelblock.txt", na="", delim = " ")
stats
## # A tibble: 109 × 15
##    locus    length segSites nHaps    nucDiv ttRatio model gammaShape rate1 rate2
##    <chr>     <dbl>    <dbl> <dbl>     <dbl>   <dbl> <chr>      <dbl> <dbl> <lgl>
##  1 L10061      411        1     2 0.0000280   Inf   F81            1     1 NA   
##  2 L1096075    388        7     8 0.000234      6   K80            1     1 NA   
##  3 L11008      472        2     6 0             1   JC             1     1 NA   
##  4 L11074      468        2     3 0.0000491     0   JC             1     1 NA   
##  5 L11315      499        4     5 0.000162      1   F81            1     1 NA   
##  6 L11796      516        1     2 0.0000239   Inf   F81            1     1 NA   
##  7 L1180       496        5     6 0.000158      1.5 F81            1     1 NA   
##  8 L12220      337        2     4 0.0000346   Inf   F81            1     1 NA   
##  9 L1257       512        3     9 0             0.5 JC             1     1 NA   
## 10 L12728      618        1     2 0.0000180     0   JC             1     1 NA   
## # ℹ 99 more rows
## # ℹ 5 more variables: rate3 <lgl>, rate4 <lgl>, Q1 <dbl>, Q2 <dbl>, Q3 <lgl>

So we have 16 invariant loci, and the mean overall transition:transversion ratio is 0.8073394. Mean gamma shape parameter is 1, which argues for only one rate.

Parmfile

I went through and compared my 3.x parmfile line by line to the default one generated by Migrate 4.4.4, and came up with this

################################################################################
# Parmfile for Migrate 4.4.4(git:v4-series-26-ge85c6ff)-June-1-2019 [do not remove these first TWO lines]
# generated automatically on
# Fri Feb 4 2022
menu=NO
nmlength=10
datatype=SequenceData
datamodel=HKY
ttratio=1.000000

freqs-from-data=YES 

seqerror-rate={0.0001,0.0001,0.0001,0.0001}
categories=1 #no categories file specified
rates=1: 1.000000 
prob-rates=1: 1.000000 
autocorrelation=NO
weights=NO
recover=NO
fast-likelihood=NO
inheritance-scalars={1.00000000000000000000}
haplotyping=YES:no-report
population-relabel={1 2 3 4 5 6 7 8 9}
infile=../../../ptuberculosa.mig
random-seed=AUTO #OWN:410568459
title= Palythoa tuberculosa - Hawaii
progress=YES
logfile=YES:logfile.txt
print-data=NO
outfile=outfile.txt
pdf-outfile=outfile.pdf
pdf-terse=YES
use-M=YES
print-tree=NONE
mig-histogram=MIGRATIONEVENTSONLY
skyline=NO #needs mig-histogram=ALL:...
theta=PRIOR:50
migration=PRIOR:10
rate=PRIOR:50
split=PRIOR:10
splitstd=PRIOR:10
mutation=CONSTANT
analyze-loci=A
divergence-distrib=E
custom-migration={
*   0   0   0   0   d   0   0   0
0   *   *   0   0   0   0   0   0
0   *   *   *   0   0   0   0   0
0   0   *   *   *   0   0   0   0
0   0   0   *   *   *   0   0   0
0   0   0   0   *   *   *   0   0
0   0   0   0   0   *   *   *   0
0   0   0   0   0   0   *   *   *
0   0   0   0   0   0   0   *   *
}
geo=NO

updatefreq=0.200000 0.200000 0.200000 0.200000 #tree, parameter haplotype, timeparam updates
bayes-posteriorbins= 2000 2000
bayes-posteriormaxtype=TOTAL
bayes-file=YES:bayesfile
bayes-allfile=YES:bayesallfile
bayes-all-posteriors=YES:bayesallposterior
bayes-proposals= THETA METROPOLIS-HASTINGS Sampler
bayes-proposals= MIG SLICE Sampler
bayes-proposals= DIVERGENCE METROPOLIS-HASTINGS Sampler
bayes-proposals= DIVERGENCESTD METROPOLIS-HASTINGS Sampler
bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000 
bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100
bayes-priors= SPLIT * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= SPLITSTD * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-hyperpriors=NO
long-chains=1
long-inc=100
long-sample=10000
burn-in=2000  
auto-tune=YES:0.440000
assign=NO
heating=YES:1:{1.000000,1.500000,3.000000,1000000.000000}
heated-swap=YES
moving-steps=NO
gelman-convergence=No
replicate=NO
end

Test Run 1

Let’s see what happens! Started this run on Jan 22, at 10pm.

screen -S migrate_testrun

mpirun -np 32 ~/migrate-4.4.4/src/migrate-n-mpi parmfile

And it was finished the next morning! Prior on m was too wide, giving results like this:

Results

Wide Priors on m No bueno.

Test Run 2

Revise m prior

Revised the m prior like so (I had no window value in the original!) Order of values is min, mean, maximum, proposal window.

bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100

Panmixia

A few modifications to make a panmictic model

population-relabel={1 1 1 1 1 1 1 1 1 1}
#and
custom-migration={
*
}

Island Model

Change all parameter values to m for island model.

custom-migration={
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
}

Minor github issue. Got a little hung up at this point because I accidentally committed some bayesallfiles that were hundreds of Mb, and of course github wouldn’t let me upload those. Thought I fixed it with these instructions. Then ended up trying git-filter-repo (installed via homebrew), then ended up restoring the whole thing from the github repository. Ouch. Then it wouldn’t push until I ran git prune

Results

Model Selection

Very nice! 😃 Going to need to modify my code to read multilocus output from migrate-n.

workingDir <- "./migrate/run2"

modelMarglikes <- harvest.model.likelihoods(workingDir = workingDir)
## [1] "./migrate/run2/island"
## [1] "./migrate/run2/panmixia"
## [1] "./migrate/run2/stepping.stone"
kable(modelMarglikes, format.args = list(digits = 5))
model thermodynamic bezier.corrected harmonic
island -71972 -71120 -71813
panmixia -70400 -69492 -69655
stepping.stone -65995 -65136 -65790
results <- bfcalcs(modelMarglikes)

kable(results)
model thermodynamic bezier.corrected harmonic lbf choice modelprob
island -71971.81 -71120.19 -71812.53 -11967.82 3 0
panmixia -70400.30 -69492.18 -69654.60 -8711.80 2 0
stepping.stone -65995.43 -65136.28 -65789.54 0.00 1 1

Parameters

workingDir2 <- "/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/migrate/testrun/run2"

# The stepping-stone model won the model selection above.
winningModelDir <- file.path(workingDir2, "stepping.stone")

# Read the full posterior sample logged by migrate-n.
# This may take a minute or two -- bayesallfile holds every recorded step.
data <- read.table(file.path(winningModelDir,"bayesallfile"), header=TRUE) 

# Convert migrate's mutation-scaled rates to migrants per generation
# (migrants.per.gen() comes from the sourced IBD_functions.R).
data.nm <- migrants.per.gen(data)

# BUG FIX: removed a stray, discarded `data %>% group_by(locus) %>%
# summarize()` line here -- its result was unused, and the column in this
# file is `Locus` (capital L), so it would error anyway.

# Split the whole list into the individual replicates, so that you will
# have a list of data frames
#data.list.1 <- split(data.nm,data$Replicate)

#split the data on locus
data.list.1 <- split(data.nm,data$Locus)

# Convert each per-locus data frame to an mcmc object and bundle them
# into an mcmc.list (one chain per locus).
# BUG FIX: the original later overwrote this with
# mcmc.list(data.list.1) -- a list of raw data frames, which is not a
# valid mcmc.list; that second assignment has been removed.
data.list.mcmc <- mcmc.list(lapply(data.list.1,mcmc))   

# Remove burnin if it wasn't removed in the run, where burnin is a proportion of the total.
# e.g. First 40% First 20,000 samples out of 50,000 or 10 million steps out of 25 million
#burnin <- 0.2
#data.list.1<-lapply(data.list.1,subset,subset=Steps>(burnin*max(data.list.1[[1]]$Steps)))

#collapse all chains to one for purposes of highest posterior density
# BUG FIX: the original referenced the undefined object data.list.2 here.
data.list.allinone <- mcmc(data=do.call("rbind",data.list.1)) 

# Summary and convergence diagnostics across the per-locus chains.
summ <- summary(data.list.mcmc)
ess <- effectiveSize(data.list.mcmc)
gelman <- gelman.diag(data.list.mcmc,multivariate=FALSE)
HPD <- HPDinterval(data.list.allinone)

#concatenate the stats
allstats<-cbind(summ$statistics[,1:2],HPD,ess,gelman$psrf)
#write.csv(allstats,file="codastats.csv")

Models

’Ale’a and I settled on the following initial models


#Panmixia (panmixia; 1 panmictic population)                        
population-relabel={1 1 1 1 1 1 1 1 1 1}                                
custom-migration=   *                           
                                
                                
                                
                                
                                
#n-Island (island; all populations same size, all exchange migrants)                                
population-relabel={1 2 3 4 5 6 7 8 9}                              
custom-migration=   
  Pop_Kure                  m   m   m   m   m   m   m 
    Pop_P&H                   m m   m   m   m   m   m
    Pop_MaroReef                m   m   m   m   m   m   m
    Pop_FFS                   m m   m   m   m   m   m
    Pop_Kauai                   m   m   m   m   m   m   m
    Pop_Oahu                    m   m   m   m   m   m   m
    Pop_Molokai               m m   m   m   m   m   m
    Pop_Maui                    m   m   m   m   m   m   m
    Pop_BigIsland               m   m   m   m   m   m   m
                                
                                
                                
                                
                                
                                
#Stepping-Stone (stepping.stone)                
population-relabel={1 2 3 4 5 6 7 8 9}                              
    Pop_Kure                    *   *   0   0   0   0   0   0   0
    Pop_P&H                   * *   0   0   0   0   0   0   0
    Pop_MaroReef                0   *   *   *   0   0   0   0   0
    Pop_FFS                   0 0   *   *   *   0   0   0   0
    Pop_Kauai                   0   0   0   *   *   *   0   0   0
    Pop_Oahu                    0   0   0   0   *   *   *   0   0
    Pop_Molokai               0 0   0   0   0   *   *   *   0
    Pop_Maui                    0   0   0   0   0   0   *   *   *
    Pop_BigIsland               0   0   0   0   0   0   0   *   *
                                
                                
                                
                                
                                
#Stepping-Stone with a Kure-Oahu connection (stepping.stone.KO)                             
population-relabel={1 2 3 4 5 1 7 8 9}                              
custom-migration=   
    Pop_Kure                    *   *   0   0   0   0   *   0   0
    Pop_P&H                   * *   0   0   0   0   0   0   0
    Pop_MaroReef                0   *   *   *   0   0   0   0   0
    Pop_FFS                   0 0   *   *   *   0   0   0   0
    Pop_Kauai                   0   0   0   *   *   *   0   0   0
    Pop_Oahu                    *   0   0   0   *   *   *   0   0
    Pop_Molokai               0 0   0   0   0   *   *   *   0
    Pop_Maui                    0   0   0   0   0   0   *   *   *
    Pop_BigIsland               0   0   0   0   0   0   0   *   *
                                
                                
                                
                                
#NWHI vs MHI    (NWHI_MHI; two panmictic regions)                   
population-relabel={1 1 1 1 2 2 2 2 2}                              
custom-migration=   
  Pop_NWHI                  *   *                   
    Pop_MHI                   * *                   
                                
                                
                                
                                
# Stepping Stone Breaks (stepping.stones.breaks; 
#  regional groups of Kure_PH vs MR_FFS vs MHI as lumped populations 
#  with stepping stone gene flow between them. )                                
population-relabel={1 1 2 2 3 4 5 6 7}                              
custom-migration=   
  Pop_Kure_PH                   *   *   0   0   0   0   0
    Pop_MaroReef_FFS                *   *   *   0   0   0   0
    Pop_Kauai                       0   *   *   *   0   0   0
    Pop_Oahu                        0   0   *   *   *   0   0
    Pop_Molokai                   0 0   0   *   *   *   0
    Pop_Maui                        0   0   0   0   *   *   *
    Pop_BigIsland                   0   0   0   0   0   *   *
                                
                                
                                
# Stepping Stone Breaks (stepping.stones.breaks; 
#  regional groups of Kure_PH vs MR_FFS vs MHI as lumped populations 
#  with stepping stone gene flow between them, and a Kure-Oahu connection.)                                 
population-relabel={1 1 2 2 3 4 5 6 7}                              
custom-migration=   
  Pop_Kure_PH                   *   *   0   *   0   0   0
    Pop_MaroReef_FFS                *   *   *   0   0   0   0
    Pop_Kauai                       0   *   *   *   0   0   0
    Pop_Oahu                        *   0   *   *   *   0   0
    Pop_Molokai                   0 0   0   *   *   *   0
    Pop_Maui                        0   0   0   0   *   *   *
    Pop_BigIsland                   0   0   0   0   0   *   *
      

I also turned menu = NO, and replicate=YES:3.

Gene Flow between Oahu and Kure

There is an intriguing result of nearly one-way gene flow from Oahu to Kure. Also, the stepping.stone.breaks.KO is much better than just stepping.stone.breaks. This suggests that a divergence parameter for these two populations is in order.

Gene Flow from Oahu to Kure
Gene Flow from Oahu to Kure
Gene Flow from Kure to Oahu
Gene Flow from Kure to Oahu

What about one way gene flow?

Stepping.stone.KO.oneway

#Stepping-Stone with a *one way* Kure-Oahu ongoing gene flow connection 
# (stepping.stone.KO.oneway)                                
population-relabel={1 2 3 4 5 1 7 8 9}                              
custom-migration={
*   *   0   0   0   *   0   0   0
*   *   *   0   0   0   0   0   0
0   *   *   *   0   0   0   0   0
0   0   *   *   *   0   0   0   0
0   0   0   *   *   *   0   0   0
*   0   0   0   *   *   *   0   0
0   0   0   0   0   *   *   *   0
0   0   0   0   0   0   *   *   *
0   0   0   0   0   0   0   *   *
}

Adding divergence models

Peter says you can mix * with d parameters, and I tested it and it worked! Based on these results I’m going to add a model of one way ongoing migration from Oahu to Kure, and another model of divergence from Oahu to Kure about 80 years ago with no subsequent gene flow, and finally, divergence plus migration between Oahu to Kure.

stepping.stone.KO.D

Equilibrium Gene Flow, but divergence with gene flow from Oahu to Kure


#Stepping-Stone with a *one way* Kure-Oahu divergence with gene flow 
# (stepping.stone.KO.divmig)                                
population-relabel={1 2 3 4 5 1 7 8 9}                              
custom-migration=   
    Pop_Kure                    *   0   0   0   0   0   D   0   0
    Pop_P&H                   0 *   0   0   0   0   0   0   0
    Pop_MaroReef                0   *   *   *   0   0   0   0   0
    Pop_FFS                   0 0   *   *   *   0   0   0   0
    Pop_Kauai                   0   0   0   *   *   *   0   0   0
    Pop_Oahu                    0   0   0   0   *   *   *   0   0
    Pop_Molokai               0 0   0   0   0   *   *   *   0
    Pop_Maui                    0   0   0   0   0   0   *   *   *
    Pop_BigIsland               0   0   0   0   0   0   0   *   *

stepping.stone.KOd

Equilibrium Gene Flow, but divergence without gene flow from Oahu to Kure

#Stepping-Stone with a *one way* Kure-Oahu divergence without gene flow 
# (stepping.stone.KO.div)   
population-relabel={1 2 3 4 5 1 7 8 9}                              
custom-migration=   
    Pop_Kure                    *   0   0   0   0   0   d   0   0
    Pop_P&H                   0 *   0   0   0   0   0   0   0
    Pop_MaroReef                0   *   *   *   0   0   0   0   0
    Pop_FFS                   0 0   *   *   *   0   0   0   0
    Pop_Kauai                   0   0   0   *   *   *   0   0   0
    Pop_Oahu                    0   0   0   0   *   *   *   0   0
    Pop_Molokai               0 0   0   0   0   *   *   *   0
    Pop_Maui                    0   0   0   0   0   0   *   *   *
    Pop_BigIsland               0   0   0   0   0   0   0   *   *

stepping.stone.splitsD

North to South Divergence with migration

*   0   0   0   0   0   0   0   0
D   *   0   0   0   0   0   0   0
0   D   *   0   0   0   0   0   0
0   0   D   *   0   0   0   0   0
0   0   0   D   *   0   0   0   0
0   0   0   0   D   *   0   0   0
0   0   0   0   0   D   *   0   0
0   0   0   0   0   0   D   *   0
0   0   0   0   0   0   0   D   *

stepping.stone.splits.d

North to South Divergence without migration

*   0   0   0   0   0   0   0   0
d   *   0   0   0   0   0   0   0
0   d   *   0   0   0   0   0   0
0   0   d   *   0   0   0   0   0
0   0   0   d   *   0   0   0   0
0   0   0   0   d   *   0   0   0
0   0   0   0   0   d   *   0   0
0   0   0   0   0   0   d   *   0
0   0   0   0   0   0   0   d   *

stepping.stone.splitsD.KOsplitD

Here, population 1 isn’t born until it splits from population 6. I’m having trouble setting an individual prior for a very young divergence from 6 to 1- apparently I found a bug- Peter is fixing it.

*   0   0   0   0   d   0   0   0
0   *   0   0   0   0   0   0   0
0   D   *   0   0   0   0   0   0
0   0   D   *   0   0   0   0   0
0   0   0   D   *   0   0   0   0
0   0   0   0   D   *   0   0   0
0   0   0   0   0   D   *   0   0
0   0   0   0   0   0   D   *   0
0   0   0   0   0   0   0   D   *

Priors

And we need to add reasonable priors on divergence time to match the foundation of Naval Air Station Midway in 1941, with a standard deviation of say 20 years because it was placed under Navy control in 1903. Splitting times \(\tau\) in Migrate are in units of \(\mu\) x generations. I will assume a nuclear mutation rate of 1e-9 and a generation time of 1 year. Peter confirmed

# Convert calendar-year divergence times into Migrate's tau units
# (tau = mu * generations), assuming a nuclear mutation rate of 1e-9
# per site per generation and a 1-year generation time.
#take the mean of the modes from one replicate of the stepping.stone.KO model
# NOTE(review): if reactivated, the line below computes mean() of a single
# sum (which just returns the sum); it should be mean(c(0.00570, 0.00450, ...)).
#meantheta <- mean(0.00570+0.00450+0.00450+0.00490+0.00630+0.00510+0.00570+0.00510+0.00590)
mu <- 1e-9           # assumed mutation rate per site per generation
generation_time <- 1 # years per generation

mean_split <- 80*mu/generation_time    # 1941 founding of NAS Midway -> 8e-8
std_split <- 20*mu/generation_time     # +/- 20 yr (Navy control since 1903) -> 2e-8
max_split <- 163 * mu/generation_time  # Midway discovered 1859 -> 1.63e-7

80 years ago is \(8 \times 10^{-8}\), and a max of 163 years (Midway discovered in 1859) which is \(1.63 \times 10^{-7}\).

It turns out that setting individual priors on divergence isn’t working right now, so I’m just going to have to set a wide prior if there are other d or D parameters and only can set the specific prior if the Kure Oahu split is the only split in the model.

This results in adding the following two lines to the parmfiles with only the Kure Oahu split.

bayes-priors= SPLIT * * WEXPPRIOR: 0.0 8e-8 1.63e-7 1.63e-8 
bayes-priors= SPLITSTD * * WEXPPRIOR: 0.0 8e-8 1.63e-7 1.63e-8

Unfortunately the prior above was resulting in error messages.

[1]    63427 illegal hardware instruction  /Applications/migrate/migrate-4.4.4/migrate-n

From the output, it looks like it can’t consider priors smaller than 1e-6. But even using 1e-5 or 1e-4 results in errors.

So all will use this wider prior. Also, I am using exponentially distributed priors, so the SPLITSTD is not going to be used.

bayes-priors= SPLIT * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= SPLITSTD * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000

Copy It Up

Copy it up, and then make a total of 10 replicate copies of the folder.

# Copy the run directory to the cluster, stage the parmfiles into rep1,
# then clone rep1 into reps 2-10 on the server.
scp -r ./run4.1 ecrandall@nautilus.psu.edu:./Ptuberculosa/run4.1    
mkdir rep1
mv step* rep1
cp ../run4/ptuberculosa.mig ./ptuberculosa.mig
cp ../run4/mpi_run_migrate-n.sh ./mpi_run_migrate-n.sh
cd rep1
#once on server, copy it 9 times
# NOTE(review): the loop below assumes the working directory is the run
# folder (rep1's parent), but the preceding `cd rep1` moved inside rep1 —
# confirm a `cd ..` happened on the server before running this.
for a in $(seq 2 10); do cp -r rep1 rep$a; done

And Back down


# Pull finished runs back from the two servers, skipping the very large
# raw posterior files (bayes*) and any PDFs.
rsync -av -e ssh --exclude='bayes*'  ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4.5 ./

rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf"  eric@moneta.tobolab.org:~/Palythoa/run4.6 ./run4.6

# then copied them into the run4 folder with:
# the exclude statement excludes stepping.stone.KOd and stepping.stone.KO.D which didn't complete.
# NOTE(review): the bracket expression [\.d] also matches a literal
# backslash; '*KO[.d]*' expresses the intent (KOd / KO.D) more cleanly.
rsync -av --progress run4.5/ run4/ --exclude '*KO[\.d]*'  

rsync -av --progress run4.6/ run4/

Bash Script

So we will do 10 replicates of 3 replicates. This will start at r1, and run all models for that before moving on. Pretty sure this will finish one whole model before moving on to the next one (since all threads are being used for different loci)

#!/bin/bash
# Run migrate-n for every model directory inside every replicate directory.
# Outer loop: replicate folders (rep1/ ... rep10/); inner loop: model
# folders. Each run uses all 32 MPI ranks (distributed over loci), so one
# model completes before the next starts. date.txt records each start time.
# Fixes: completed the bare "#!" shebang, quoted directory variables, and
# bail out if a cd fails so later runs can't execute in the wrong directory.
for r in */
    do
        cd "$r" || exit 1
        echo "$r"
        date
        date > date.txt
            for m in */
              do
                cd "$m" || exit 1
                  date > date.txt
                  echo "$m"
                  date
                  mpirun -np 32 ~/migrate-4.4.4/src/migrate-n-mpi parmfile
                  sleep 1
                cd ..
              done
        cd ..
    done

Results

Model Marginal Likelihoods

# Directory holding the 10 replicate migrate runs (rep1 ... rep10).
runDir <- "/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Documents/Research/Ptuberculosa/migrate/run4"
#runDir <-"/Volumes/GoogleDrive-103325533945714033178/My Drive/ptuberculosa/migrate/run4"

# Harvest the model marginal likelihoods from each replicate into a named
# list: likelist[["rep1"]] ... likelist[["rep10"]].
likelist <- list()
for (rep_id in paste0("rep", seq_len(10))) {
  print(rep_id)
  likelist[[rep_id]] <- harvest.model.likelihoods(
    workingDir = file.path(runDir, rep_id)
  )
}

# Bayes-factor model selection computed separately for each replicate
likelist %>% map(bfcalcs)

The final model marginal likelihood estimates based on the mean of 10 replicates:

# Stack all 10 replicates into one data frame, grouped by model.
like.df <- bind_rows(likelist) %>% group_by(model)

# Mean bezier-corrected marginal likelihood per model, across replicates.
means <- summarize(like.df, bezier.corrected = mean(bezier.corrected))

# Bayes-factor model selection on the replicate means.
final_model <- bfcalcs(means)

final_model
##                              model bezier.corrected       lbf choice modelprob
## 1                         NWHI_MHI        -72281.30 -35630.64      8         0
## 2                           island        -75311.81 -41691.66      9         0
## 3                         panmixia        -72172.88 -35413.80      7         0
## 4                   stepping.stone        -71256.23 -33580.50      6         0
## 5                stepping.stone.KO        -69254.51 -29577.06      3         0
## 6              stepping.stone.KO.D        -55474.16  -2016.36      2         0
## 7               stepping.stone.KOd        -54465.98      0.00      1         1
## 8            stepping.stone.breaks        -71110.23 -33288.51      5         0
## 9         stepping.stone.breaks.KO        -70307.17 -31682.39      4         0
## 10         stepping.stone.splits.d        -77896.08 -46860.21     12         0
## 11          stepping.stone.splitsD        -77091.94 -45251.93     11         0
## 12 stepping.stone.splitsD.KOsplitd        -76884.10 -44836.24     10         0

Figures

And some figures summarizing all this. The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The three models with divergences are much worse than those without. Difference between stepping.stone and stepping.stone.KO is very significant.

# Long-form axis labels for the 12 models.
# NOTE(review): label order must match the (alphabetical) factor order of
# the `model` column in the likelihood table — e.g. "N-Island" labels the
# `island` model; confirm the mapping against the final_model table above.
models <- c("N-Island", "NWHI vs. MHI"," Panmixia", "SS: Islands",
            "SS: Regional Breaks", "SS: Regional Breaks, KO 2way",
            "SS: Islands, OK 2way", 
            "SS: Islands, O2K Colonization with GF",
            "SS: Islands, O2K Colonization without GF",
            "Seq Div: No GF","Seq Div: GF",
            "Seq Div: GF, O2K Colonization without GF")

# Single-letter codes for the same 12 models, repeated once per replicate
# (10 replicates) so they can be column-bound to the stacked table below.
# NOTE(review): this assumes bind_rows(likelist) returns rows rep-major
# with the 12 models in identical order within every replicate — confirm.
model_letters <- c("B", "C"," A", "D","G", "H",
                    "F", "L", "M", "I","J","K")
model_letters <- rep(model_letters,10)

# Violin plot of the 10 per-replicate marginal likelihoods for each model,
# with full descriptive x-axis labels (median drawn inside each violin).
likesPlot <- likelist %>% bind_rows() %>% 
              group_by(model) %>% 
              ggplot(mapping = aes(x=model, y = bezier.corrected)) +
              geom_violin(draw_quantiles = 0.5) +
              theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
              scale_x_discrete(labels = models) +
              labs(x = "Metapopulation Model", 
                   y = "Bezier Corrected Marginal Likelihood")
likesPlot

# Same data, labeled with the single-letter model codes (publication figure).
likesPlot_letters <- likelist %>% bind_rows() %>% 
                      bind_cols(model_letters=model_letters) %>% 
                      group_by(model_letters) %>% 
                      ggplot(mapping = aes(x=model_letters, y = bezier.corrected)) +
                      geom_violin(draw_quantiles = 0.5) +
                      labs(x = "Metapopulation Model", 
                      y = "Bezier Corrected Marginal Likelihood")

# Save the letter-labeled version for the manuscript.
ggsave("figures/final_figures/MarginalLikelihoodsPlot.pdf", likesPlot_letters)
## Saving 7 x 5 in image

Rerun the three top models

Sigh. I figured out that I had a ridiculously high Ti:Tv rate (\(\kappa\)) for 3 loci that had K80 models. This might have affected the summed marginal likelihood of each model just a bit, and definitely could affect parameter estimation, so I am going to rerun the three top models (stepping.stone.KOd, stepping.stone.KO.D, stepping.stone.KO) using a new version of migrate that doesn’t have bayesfile visualization issues, and with the ti:tv ratio fixed on those three loci.

Final Model Selection Results

Model Marginal Likelihoods

# run4.7: rerun of the three top models with the Ti:Tv ratio fixed on the
# three loci that had K80 models.
runDir <-"/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Documents/Research/Ptuberculosa/migrate/run4.7"

# Harvest marginal likelihoods for each of the 10 replicates.
likelist2 <- list()
for (rep_id in paste0("rep", seq_len(10))) {
  print(rep_id)
  likelist2[[rep_id]] <- harvest.model.likelihoods(
    workingDir = file.path(runDir, rep_id)
  )
}

# Bayes-factor model selection for each replicate separately
likelist2 %>% map(bfcalcs)

The final model marginal likelihood estimates based on the mean of 10 replicates:

# Stack the 10 replicates of the rerun into one grouped data frame.
like2.df <- bind_rows(likelist2) %>% group_by(model)

# Mean bezier-corrected marginal likelihood per model across replicates.
means2 <- summarize(like2.df, bezier.corrected = mean(bezier.corrected))

# Bayes-factor model selection on the means.
final_model2 <- bfcalcs(means2)

final_model2
##                 model bezier.corrected        lbf choice    modelprob
## 1   stepping.stone.KO        -65211.06 -34149.632      3 0.000000e+00
## 2 stepping.stone.KO.D        -48214.14   -155.794      2 1.478301e-34
## 3  stepping.stone.KOd        -48136.25      0.000      1 1.000000e+00

T-Test

The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The difference between the top two models is significant. 🙋

# Pull out the best and second-best models from the selection table.
top.choice <- final_model2$model[final_model2$choice == 1]
second.choice <- final_model2$model[final_model2$choice == 2]

# One-sided permutation t-test (perm::permTS): is the per-replicate
# marginal likelihood of the top model greater than the runner-up's?
TS <- permTS(
  like2.df$bezier.corrected[like2.df$model == top.choice],
  like2.df$bezier.corrected[like2.df$model == second.choice],
  alternative = "greater"
)

TS

Figures

And some figures summarizing all this. The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The three models with divergences are much worse than those without. Difference between stepping.stone and stepping.stone.KO is very significant.

# Axis labels for the two divergence models; stepping.stone.KO is
# filtered out of both plots below.
models <- c("SS: Islands, O2K Colonization with GF", 
            "SS: Islands, O2K Colonization without GF")

# Single-letter codes for the same two models (publication figure).
model_letters2 <- c("L","M")

# Violin plot of per-replicate marginal likelihoods, descriptive labels.
likesPlot2 <- likelist2 %>% bind_rows() %>% filter(model != "stepping.stone.KO") %>% 
              group_by(model) %>% 
              ggplot(mapping = aes(x=model, y = bezier.corrected)) +
              geom_violin(draw_quantiles = 0.5) +
              theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
              scale_x_discrete(labels = models) +
              labs(x = "Metapopulation Model", 
                   y = "Bezier Corrected Marginal Likelihood") +
              ylim(-50000,-45000)
likesPlot2

# Same plot, labeled with the single-letter model codes.
likesPlot2_letters <- likelist2 %>% bind_rows() %>% filter(model != "stepping.stone.KO") %>%
              group_by(model) %>% 
              ggplot(mapping = aes(x=model, y = bezier.corrected)) +
              geom_violin(draw_quantiles = 0.5) +
              scale_x_discrete(labels = model_letters2) +
              labs(x = "Metapopulation Model", 
                   y = "Bezier Corrected Marginal Likelihood") +
              ylim(-50000,-45000)

# Fix: display the letter-labeled plot. The original re-displayed
# likesPlot2 here, so likesPlot2_letters was never shown.
likesPlot2_letters

Model Parameter Estimates

Because I set such wide priors on gene flow for marine species, the standard PDF output does a poor job of binning the data, such that even though an equilibrium model of gene flow is the best model, Migrate is telling me that all migration parameters have a mode of 2.5 and 0 at the 25th percentile. It would be nice if I could look at the posterior calculated across all loci myself, but it is unclear how Migrate does this exactly. I inquired about this to Peter, and he got back to me with this method for summing the posterior over all loci and some Python code as a demo.

  1. create a histogram for each locus
  2. log the histogram (there will be issues with zero-value cells; note that migrate smoothes each locus histogram)
  3. subtract the log(prior)
  4. sum all log histograms for each locus
  5. add the log prior back
  6. convert to non-log
  7. adjust so that the new histogram sums to 1.0

Read in the posteriors and make a labeling key

# Load the raw posterior ("bayesallfile") for the chosen model from the
# first two replicates of run4.7, then reshape for plotting.
Dir <- "/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Documents/Research/Ptuberculosa/migrate/run4.7"
      
# Parameter estimation is done under the divergence-with-gene-flow model.
winningModel <- "stepping.stone.KO.D"

posteriors <- list()

#read in the posterior for one replicate this may take 5 minutes
for(r in 1:2){
  print(paste("Loading Rep",r))
  posteriors[[r]] <- read.table(file.path(Dir,paste0("rep",r),winningModel,
                                      "bayesallfile"), header=T) 
}
## [1] "Loading Rep 1"
## [1] "Loading Rep 2"
#combine into one, and then keep only the first 2 reps (based on diagnostics below)
# NOTE(review): `rep` is the character id from bind_rows(.id = "rep"), so
# `rep <= 2` is a string comparison — fine for single-digit reps, but it
# would misorder "10" vs "2". migrants.per.gen() comes from IBD_functions.R
# (not shown); presumably it adds the Nm_ columns — confirm.
posteriors12 <- posteriors %>% bind_rows(.id = "rep") %>% 
                                filter(rep <= 2) %>% migrants.per.gen()

# Pivot every parameter column into long (parameter, value) format.
posteriors12_long <- posteriors12 %>% pivot_longer(starts_with(c("Theta_", "M_", 
                                                           "Nm_","D_","d_")),
                                                    names_to = "parameter",
                                                    values_to = "value") 


# read in a population key to translate population numbers into names in figures
popkey <- read_csv("popkey.csv")
## Rows: 9 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Pop
## dbl (1): Index
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# name the Pop vector by its numeric index so it can drive str_replace_all()
names(popkey$Pop) <- popkey$Index
#make a new labeling key: human-readable names for every parameter column
posterior_parameters <- names(posteriors12) %>% str_subset("Theta|M_|Nm_|D_|d_")

# Build pretty labels, e.g. "M_6_1" -> "m/mu Oahu to Kure":
#   "6_1" -> "6 to 1"; expand the M_/Theta_/Nm_ prefixes; then swap the
#   population numbers for names via the named popkey vector.
parameter_key <- posterior_parameters %>% 
                                  str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%
                                  str_replace("M_", "m/mu " ) %>% 
                                  str_replace("Theta_", "theta ") %>% 
                                  str_replace_all(popkey$Pop) %>% 
                                  str_replace("Nm_", "4Nem ") 
                                  

# index the key by the original column names for use in labeller()
names(parameter_key) <- posterior_parameters

Check Convergence

# posteriors.grouped <- posteriors %>% 
#                       select(all_of(c("Locus",
#                                         "Steps",
#                                         posterior_parameters))) %>% 
#                       group_by(rep)

# posterior.reps <- group_split(posterior.grouped)
# posterior.key <- group_keys(posterior.grouped)

#thin by keeping every 500th recorded step
# NOTE(review): the original comment said "every other step" — that is
# consistent only if migrate recorded a sample every 250 steps; confirm.
posteriors2 <- lapply(posteriors,filter,Steps%%500==0)
# select parameters only
posteriors2 <- lapply(posteriors2, select, starts_with(c("Theta_", "M_", 
                                                          "Nm_","D_","d_")))

                        
# coda diagnostics: one mcmc chain per replicate
posteriors.mcmc <- mcmc.list(lapply(posteriors2,mcmc,thin=500))
posterior.ggs <- ggs(posteriors.mcmc, burnin = F)
# effective sample sizes for the first replicate
effectiveSize(posteriors2[[1]])
#ggmcmc(posterior.ggs, param_page = 4)
Effective Sizes

Effective sizes are good except for M_3_4.

effectiveSize(posteriors2[[1]]) (first replicate)

     Theta_1      Theta_2      Theta_3      Theta_4      Theta_5      Theta_6      Theta_7      Theta_8 
127773.10928  60095.43078  58552.43846  71943.00597  77691.51114  61662.55359  45230.02725  64583.65543 
     Theta_9        M_6_1        M_3_2        M_2_3        M_4_3        M_3_4        M_5_4        M_4_5 
 79323.26808 147192.07123  67813.55644    448.34109   1473.22013     68.48918   3983.63608  39822.63084 
       M_6_5        M_5_6        M_7_6        M_6_7        M_8_7        M_7_8        M_9_8        M_8_9 
140352.48682 316433.45752 333193.44720  63909.27617  63612.89938 653891.00000  38723.33119 441357.70803 
       D_6_1 
202634.81598

effectiveSize(posteriors.mcmc[c(1,2)])
  Theta_1   Theta_2   Theta_3   Theta_4   Theta_5   Theta_6   Theta_7   Theta_8   Theta_9     M_6_1 
268101.80 175117.21  86454.19  84954.50  91454.47 134416.13 127731.50 110109.44 174394.67 313435.43 
    M_3_2     M_2_3     M_4_3     M_3_4     M_5_4     M_4_5     M_6_5     M_5_6     M_7_6     M_6_7 
674097.79 236556.17 421717.30 218933.72 390887.59 272926.62 585871.37 589532.54 613794.87 495933.81 
    M_8_7     M_7_8     M_9_8     M_8_9     D_6_1 
421019.97 667326.66  39515.35 531464.72 415678.34
Gelman-Rubin Diagnostic

Gelman-Rubin diagnostic is a bit high for M_ parameters, but if I log transform them, all Gelman diagnostics are below 1.2.


gelman.diag(posteriors.mcmc)

Potential scale reduction factors:

        Point est. Upper C.I.
Theta_1       1.00       1.00
Theta_2       1.01       1.02
Theta_3       1.00       1.01
Theta_4       1.00       1.01
Theta_5       1.00       1.00
Theta_6       1.00       1.01
Theta_7       1.01       1.02
Theta_8       1.00       1.00
Theta_9       1.01       1.01
M_6_1         1.00       1.00
M_3_2         1.20       1.25
M_2_3         1.29       1.34
M_4_3         1.28       1.34
M_3_4         1.29       1.82
M_5_4         1.12       1.15
M_4_5         1.06       1.06
M_6_5         1.04       1.04
M_5_6         1.05       1.05
M_7_6         1.01       1.01
M_6_7         1.20       1.20
M_8_7         1.01       1.01
M_7_8         1.17       1.17
M_9_8         1.24       1.40
M_8_9         1.28       1.30
D_6_1         1.00       1.00

Multivariate psrf

1.03

gelman.diag(posteriors.mcmc, transform = T)
Potential scale reduction factors:

        Point est. Upper C.I.
Theta_1       1.00       1.00
Theta_2       1.00       1.01
Theta_3       1.00       1.00
Theta_4       1.00       1.01
Theta_5       1.00       1.00
Theta_6       1.00       1.01
Theta_7       1.00       1.01
Theta_8       1.00       1.00
Theta_9       1.00       1.01
M_6_1         1.00       1.00
M_3_2         1.01       1.03
M_2_3         1.01       1.03
M_4_3         1.01       1.02
M_3_4         1.02       1.04
M_5_4         1.01       1.02
M_4_5         1.00       1.01
M_6_5         1.00       1.01
M_5_6         1.01       1.02
M_7_6         1.00       1.01
M_6_7         1.00       1.01
M_8_7         1.00       1.00
M_7_8         1.01       1.01
M_9_8         1.01       1.01
M_8_9         1.00       1.01
D_6_1         1.00       1.00

Multivariate psrf

1.02

Theta plots

Here are plots of each parameter posterior, with each colored line being the posterior from one of 106 loci. Next thing I have to do is code up the summation described by Peter above.

\(\theta=4N_e\mu\)

# Per-locus posterior densities for each Theta parameter: one colored line
# per locus, faceted by parameter (labels from parameter_key).
Theta_plot <- posteriors12_long %>%  
              filter(str_detect(parameter, "Theta_")) %>%  
              ggplot() + 
              geom_density(mapping = aes(x = value, 
                    group=Locus, color =Locus), weight = 0.5) + 
                    #scale_x_log10(limits=c(1e-4,0.1)) +
                    facet_wrap(~parameter, 
                    scales = "free_y",
                    labeller = labeller(parameter= parameter_key), 
                           dir = "h",nrow=2) +
                    theme(axis.text.x = element_text(size = rel(0.6)))

Theta_plot

Migration plots

Proportion of migrants relative to mutation rate \(\frac{m}{\mu}\)

# Per-locus posterior densities for each migration-rate (M = m/mu)
# parameter, on a log x scale, faceted by parameter.
M_plot <- posteriors12_long %>%  
          filter(str_detect(parameter, "M_")) %>%  
          ggplot() + 
          geom_density(mapping = aes(x = value, 
                       group=Locus, color =Locus), weight = 0.5) + 
          scale_x_log10(limits=c(1e-4,1000)) + 
          facet_wrap(~parameter, scales = "free_y", 
                    labeller = labeller(parameter= parameter_key), 
                    dir = "v",nrow=2) +
          theme(axis.text.x = element_text(size = rel(0.6)))
M_plot

Gene Flow plots

Migrants per generation \(4N_em\)

# Per-locus posterior densities for migrants per generation (4Nem),
# on a log x scale, faceted by parameter.
Nm_plot <- posteriors12_long %>%  
          filter(str_detect(parameter, "Nm_")) %>%  
          ggplot() + 
          geom_density(mapping = aes(x = value, group=Locus, color
                                     =Locus), weight = 0.5) + 
                    scale_x_log10(limits=c(1e-6,10)) + 
                    facet_wrap(~parameter, scales = "free_y",
                    labeller = labeller(parameter= parameter_key), 
                    dir = "v",nrow=2) +
           theme(axis.text.x = element_text(size = rel(0.6)))

Nm_plot

Sum over loci

Priors

Here are the priors I’m using, as a reminder.

bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000 
bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100
bayes-priors= SPLIT * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= SPLITSTD * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000

Two Loci

Here are the basics for doing it over two loci:

# make density objects for two loci (n must be a power of 2, using 2^14)
testdens1 <- density(posterior$Theta_1[which(posterior$Locus==1)], n=16384, 
                     from = 0, to = 0.1, bw = "SJ")
testdens2 <- density(posterior$Theta_1[which(posterior$Locus==2)], n=16384, 
                     from = 0, to = 0.1, bw = "SJ")

#create the prior
thetaprior <- dexp(testdens1$x, rate = 0.001, log = F)

#create an empty tibble
densum <- tibble(.rows=16384)
densum$x <- testdens1$x
#remove the prior and standardize
densum$l1 <- (testdens1$y/ thetaprior) / sum(testdens1$y/ thetaprior)
densum$l2 <- (testdens2$y/ thetaprior) / sum(testdens2$y/ thetaprior)
#sum over loci and standardize again
densum$sum <- exp(log(densum$l1+1e-20) + log(densum$l2 + 1e-20) + 
                    log(thetaprior)) / sum( exp(log(densum$l1+1e-20) + 
                                                  log(densum$l2 + 1e-20) + 
                                                  log(thetaprior)))
#pivot to long format
densum_long <- pivot_longer(densum,cols=2:4,names_to = "yplot", values_to = "y")
#plot
ggplot(densum_long, aes(x = x, y = y, group = yplot, color = yplot)) + 
  geom_line() + scale_x_log10()

#bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000 
#bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100

Many Loci

Functions for Summarizing from Bayesfile

These functions are for reconstructing distributions from the bayesfile which consists of distributions for each locus calculated as histograms

# Column names for migrate's "bayesfile" (per-locus posterior histograms).
bayesfilenames <- c("Locus","pnumber","HPC50","HPC95","value","count",
                    "freq","cumFreq","priorFreq")
DirB <- "/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Documents/Research/Ptuberculosa/migrate/run4.6"
# Read the histogram table; migrate's comment lines start with '#'.
# NOTE(review): this uses run4.6 while the bayesallfile analysis above used
# run4.7 — confirm the intended run.
bf <- read_table(file = file.path(DirB,paste0("rep",1),winningModel,
                                      "bayesfile"), 
                             col_types = "ifiiddddd", col_names = bayesfilenames,
                             comment = "#", skip_empty_rows = T)

# Parse the parameter-number -> parameter-name legend from the bayesfile
# header (24 rows starting after line 9), normalize the names to match the
# bayesallfile column style, and join on the pretty labels built earlier.
parameter_number_table <- read_table(file = file.path(DirB,paste0("rep",1),winningModel,
                                      "bayesfile"), 
                             col_types = "cic", 
                             col_names = c("drop","pnumber","parameter"),
                             skip=9, n_max = 24) %>% 
                      select(-drop) %>%
                      mutate(parameter = str_replace_all(parameter, ",","_")) %>% 
                      mutate(parameter = str_remove_all(parameter,"\\(")) %>% 
                      mutate(parameter = str_remove_all(parameter,"\\)")) %>% 
                      left_join(tibble(parameter_name =
                                        parameter_key,parameter=names(parameter_key)),
                                by = "parameter")

# lookup: parameter number -> pretty name, for plot legends
parameter_number_key <- parameter_number_table$parameter_name 
names(parameter_number_key) <- parameter_number_table$pnumber

#bf %>% ggplot(aes(x=value, y = freq, color = Locus)) + 
#  geom_line() + scale_color_continuous() + 
#  facet_wrap(vars(Parameter), scales = "free")

# bf %>% filter(pnumber %in% 1:9) %>% filter(Locus %in% 1:109) %>% 
#       ggplot(aes(x=value, y = freq, color = Locus)) + geom_line() +
#       facet_wrap(vars(pnumber), scales = "free")
# Plot the all-loci summary histogram (Locus 110 = migrate's "All" row)
# for the nine Theta parameters, with the prior as a dotted grey line.
bf %>% filter(pnumber %in% 1:9) %>% filter(Locus == 110) %>% 
      ggplot(aes(x = value, y = freq, color = pnumber)) + geom_line() +
      scale_color_brewer(palette = "YlGnBu", labels = parameter_number_key) +
      ylim(0,0.5) + xlim(0,0.01) + geom_line(aes(x = value, y = priorFreq),
                                             color = "grey", linetype="dotted")
Functions for Summarizing from Bayesallfile

These functions are for reconstructing distributions from the “raw” posterior output, which Peter calls a bayesallfile.

remove_prior <- function(densityd, prior, threshold = 1e-10){
  # Remove a prior from the y-values of a density distribution.
  #
  # densityd:  numeric vector of density values for one locus.
  # prior:     numeric vector of prior density values on the same grid.
  # threshold: density values below this are zeroed before dividing.
  #
  # The prior is floored at 1e-30 to avoid division by zero; the result
  # is renormalized so it sums to 1.
  floored_prior <- pmax(prior, 1e-30)
  clipped <- ifelse(densityd < threshold, 0, densityd)
  unnormalized <- clipped / floored_prior
  unnormalized / sum(unnormalized)
}

sum_over_loci <- function(df,parameter){
      # Sum posterior densities over loci for one parameter.
      #
      # df: data frame of per-locus probability densities (prior already
      #     removed; per-locus columns share the `parameter` prefix),
      #     plus a logged prior column named "logPrior".
      # parameter: name of the parameter to sum (e.g. "Theta_1").
      #
      # Returns df with the intermediate sum column dropped and a new
      # column, named after `parameter`, holding the standardized density
      # summed over loci with the prior re-added once.
      # NOTE(review): starts_with(parameter) matches by prefix, so a
      # parameter whose name is a prefix of another column could pull in
      # extra columns — confirm the naming scheme rules this out.
      # The Rmpfr package provides 128-bit floats for exponentiating the
      # very negative log sums without underflow.
    require(Rmpfr)
  
    #add a teeny-tiny amount to all values to avoid log(0)
    df2 <- df %>% mutate(across(starts_with(parameter), 
                        .fns= function(x) x + 1e-11))  %>% 
      #log all values
            mutate(across(starts_with(c(parameter)),
                  .fns=log)) %>% 
      # convert the df to rowwise so that rows can be summed
      # and then sum across the row (over loci), including the logged prior
          rowwise() %>% 
          mutate(sum_param_prior = 
          sum(c_across(starts_with(c(parameter,"logPrior"))))) %>% 
    #convert back to a regular df
          ungroup()
    
    #need to convert to quadruple precision because 
    #these will exponentiate to very small numbers.
    sum_param_prior_exp <- exp(mpfr(df2$sum_param_prior, precBits = 128))
    # standardize by dividing by the sum of the column
    sum_param_prior_standardized <-
             sum_param_prior_exp/sum(sum_param_prior_exp)
    

    #drop the intermediate columns (no longer needed), change the standardized
    # output back to double precision so that it can be incorporated into the df
    # rename the summed column after the parameter
      df3 <- df2 %>% select(-c(sum_param_prior)) %>%
          mutate(summed_over_loci =
          as.numeric(sum_param_prior_standardized)) %>% 
          rename_with(.fn = ~ paste(parameter), 
                      .cols = summed_over_loci)
    return(df3)
}



summarize_posterior <- function(posterior, 
                  parameter_type = c("Theta","M","Nm","D"),
                  prior_params = c(min,mean,max),
                  n=16384){
  # Sum a Migrate-n posterior over loci for one parameter type.
  #
  # posterior:      a Migrate-n "bayesallfile" as a data frame (must have
  #                 Steps, Locus, rep columns plus parameter columns).
  # parameter_type: one of "Theta", "M", "Nm", "D".
  # prior_params:   numeric c(min, mean, max) of the (exponential) prior.
  #                 NOTE(review): the default c(min, mean, max) is a vector
  #                 of base functions — a placeholder; callers must always
  #                 supply numeric values.
  # n:              number of density grid points (power of 2).
  #
  # For each parameter of the given type: build a per-locus density,
  # remove the prior from each, sum across loci, and re-add the prior once.
  # Returns a data frame with columns x, prior, logPrior and one
  # summed-over-loci density column per parameter.
  parameters <- names(posterior) %>%
                        str_subset(parameter_type)
  # create a tibble with the x values for all density plots 
  #  for a particular parameter type
  # NOTE(review): `select(parameters[1])` uses an external string vector
  # directly (tidyselect warns; all_of() is the explicit form), and
  # `getx[,1]` below assumes data.frame drop-to-vector behavior — it would
  # fail if `posterior` were a tibble; confirm.
  getx <- posterior %>% filter(Locus == 1) %>% 
        select(parameters[1])

  # create a tibble with x values for a density plot
  #  of the chosen number of points
  dens <- tibble(.rows = n, x = density(getx[,1],  n=n, 
                           from = prior_params[1], 
                           to = prior_params[3],
                           bw = "nrd0")$x)
  print("calculating densities")
  # calculate densities for each parameter of a given type at each locus:
  # spread the samples into one column per (parameter, locus) pair, then
  # replace each column with its density y-values on the shared grid
  dens <- posterior %>% 
          select(starts_with(c("Steps","Locus","rep",
                  paste0(parameter_type,"_")))) %>% 
          pivot_wider(names_from = "Locus", values_from = 
                  starts_with(paste0(parameter_type,"_")),
                          names_sep = "-") %>% 
          select(starts_with(paste0(parameter_type,"_"))) %>% 
          map_dfc(function(x) density(x, n = n, from = 
                                    prior_params[1], 
                                    to = prior_params[3], 
                                    bw = "nrd0")$y) %>%
          bind_cols(dens)
  
  # create, standardize, log and remove prior
  # exponential prior with mean prior_params[2] (rate = 1/mean)
  dens$prior <- dexp(dens$x, rate = 1/prior_params[2], 
                     log = F)
  
  dens$prior <- dens$prior/sum(dens$prior)
  
  dens$logPrior <- log(dens$prior)
  
  print("removing prior")
  dens2 <- dens %>% 
        #remove the prior from every per-locus density, standardize
        mutate(across(starts_with(parameter_type), 
                  ~ remove_prior(densityd = .x,
                                  prior = dens$prior,
                                  threshold = 1e-10) ))

  dens3 <- dens2
    
  # collapse the per-locus columns into one summed column per parameter
  for(p in parameters){
    print(p)  
    dens3 <- sum_over_loci(df = dens3, parameter = p)
  }
  # trying to do the above loop with purrr:map_dfc
  #dens4 <- parameters %>% 
  #        map_dfc(.f = ~ sum_over_loci(df = dens2, parameter = .x))
  return(dens3)
}

posterior_stats <- function(df,parameter, bw = 6.33e-5){
  # Summary statistics for one summed-over-loci posterior density.
  #
  # df:        data frame with an `x` column (parameter values) and a
  #            column named after `parameter` holding the density at each x.
  # parameter: name of the density column to summarize (e.g. "Theta_1").
  # bw:        bandwidth recorded on the hand-built density object (needed
  #            by spatstat.explore::quantile.density). The default keeps
  #            the value the original hard-coded.
  #
  # Returns a named vector: 2.5/25/50/75/97.5% quantiles, mode,
  # density-weighted mean, and density-weighted sd.
  require(spatstat.explore)
  # Build a minimal stats::density-classed object by hand.
  # Fixes: the previous `as.list(bw = 6.33e-05)` passed bw to
  # as.list.data.frame, where it was silently ignored (the real bw was set
  # separately below); all_of() avoids tidyselect's ambiguity warning when
  # selecting with a string held in a variable.
  p <- df %>% select(all_of(c("x", parameter))) %>% as.list()
  names(p) <- c("x", "y")
  p$bw <- bw
  attr(p, "class") <- "density"
  qu <- quantile.density(p, c(0.025, 0.25, 0.5, 0.75, 0.975))
  wmo <- p$x[which(p$y==max(p$y))]
  wme <- weighted.mean(p$x, p$y)
  wsd <- sqrt(weighted.var(p$x, p$y))
  stats <- c(qu,mode = wmo, mean = wme, sd = wsd)
  return(stats)
}

Final Parameter Estimates

These are the final estimates, summed over loci.

Stepping-Stone_KOD Model

For the model with ongoing gene flow between Kure and Oahu

Theta Estimates

And here I run the code for the \(\theta\) estimates

parameter_type <- "Theta"

parameters <- names(posteriors12) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
thetas <- summarize_posterior(posteriors12, parameter_type = parameter_type, 
                              prior_params = c(0,0.001,0.1))

#convert to long format: one row per (x, parameter) for plotting
thetas_long <- thetas %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")

# one color per population; light/dark variants are used later so each
# island pair gets two distinguishable migration-direction colors
theta.colors <- rev(brewer.pal(9,"Spectral"))
theta.colors.light <- lighten(theta.colors)
theta.colors.dark <- darken(theta.colors)

#plot the posterior density for each theta, with the prior overlaid as a
# dotted grey line (size= for lines is deprecated since ggplot2 3.4; this
# file already uses linewidth= elsewhere, so made consistent here)
thetas_density <-  ggplot(thetas_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = theta.colors, 
                             labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
        ylim(0,0.175) + xlim(0,0.01)

# ridgeline view of the same posteriors, one ridge per population
thetas_ridges <- ggplot(thetas_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$\\Theta$")) + xlim(0,0.01)

# violin view; Density*10 scales the precomputed widths for visibility
thetas_violins <- ggplot(thetas_long, aes(x = Parameter, y = x, 
                                          violinwidth = Density*10, 
                                          fill = Parameter)) +
        geom_violin(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_x_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(y = TeX("$\\Theta$")) + ylim(0,0.01)


# get stats for each summed-over-loci posterior;
# .id = "parameter" carries the readable label (the names of parameter_key2)
# into the output's parameter column
theta_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = thetas, 
                                                             parameter = .x),
                                      .id = "parameter")

Theta estimates:

thetastats <- read_csv("./tables/stepping.stone.KO.D/theta_stats.csv")
## Rows: 9 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
thetastats
## # A tibble: 9 × 9
##   parameter      `2.5%`   `25%`   `50%`   `75%` `97.5%`    mode    mean       sd
##   <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
## 1 theta Kure    0.00543 0.00559 0.00568 0.00575 0.00596 0.00569 0.00567  1.61e-4
## 2 theta P&H     0.00112 0.00123 0.00129 0.00136 0.00148 0.00128 0.00130  9.25e-5
## 3 theta Maro    0.00674 0.00698 0.00699 0.00700 0.00701 0.00699 0.00698  5.73e-5
## 4 theta FFS     0.00621 0.00678 0.00680 0.00682 0.00687 0.00680 0.00678  1.37e-4
## 5 theta Kauai   0.00797 0.00801 0.00803 0.00804 0.00807 0.00803 0.00802  2.73e-5
## 6 theta Oahu    0.00762 0.00765 0.00767 0.00768 0.00772 0.00767 0.00767  2.62e-5
## 7 theta Molokai 0.00678 0.00684 0.00686 0.00689 0.00775 0.00685 0.00693  2.44e-4
## 8 theta Maui    0.00771 0.00779 0.00783 0.00785 0.00789 0.00784 0.00781  4.82e-5
## 9 theta Big Is. 0.00632 0.00638 0.00640 0.00643 0.00649 0.00640 0.00639  1.10e-4

Theta Density Theta Ridges

Theta Violins
Theta Violins

m/mu estimates

parameter_type <- "M"

parameters <- names(posteriors12) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
Ms <- summarize_posterior(posteriors12, parameter_type = parameter_type, 
                              prior_params = c(0,1,1000), n = 2^14)

#convert to long format: one row per (x, parameter) for plotting
Ms_long <- Ms %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")
# FIX: a filter(Parameter != "M_6_1") step was commented out here, but its
# trailing %>% was left behind, so the pipeline dangled into the next
# statement and broke this chunk; the stray pipe is removed.

# interleave base and dark colors: two migration directions per island pair
migration.colors <- c(rbind(theta.colors, theta.colors.dark))            
#plot posterior densities with the prior overlaid
# (size= for lines is deprecated since ggplot2 3.4 -> linewidth=)
Ms_density <-  ggplot(Ms_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        scale_x_log10(limits = c(1e-2,100)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted")
  
           
#plot ridgeline view, one ridge per migration parameter
M_ridges <- ggplot(Ms_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_x_log10(limits = c(1e-2,100)) +
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$\\frac{m}{\\mu}$"))

# violin view using the precomputed densities as violin widths
M_violins <- ggplot(Ms_long, aes(x = Parameter, y = x, violinwidth = Density, 
                    fill = Parameter)) +
        geom_violin(stat = "identity") +
        scale_y_log10(limits = c(1e-2,100)) +
        scale_fill_discrete(type = migration.colors, labels = parameter_key) 
# get stats for each summed-over-loci posterior;
# .id carries the readable label into the "parameter" column
Ms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = Ms, 
                                                             parameter = .x),
                                      .id = "parameter")

M/mu estimates:

mmustats <- read_csv("./tables/stepping.stone.KO.D/Mmu_stats.csv")
## Rows: 15 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mmustats
## # A tibble: 15 × 9
##    parameter            `2.5%`  `25%`  `50%`  `75%` `97.5%`   mode   mean     sd
##    <chr>                 <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
##  1 m/mu Oahu to Kure    66.3   66.7   67.3   68.3    71.6   66.3   67.7   1.46  
##  2 m/mu Maro to P&H      1.16   1.16   1.16   1.28    1.40   1.16   1.21  0.143 
##  3 m/mu P&H to Maro      1.10   1.16   1.22   1.22    1.28   1.16   1.20  0.0680
##  4 m/mu FFS to Maro      0.244  0.244  0.244  0.244   0.305  0.244  0.249 0.0435
##  5 m/mu Maro to FFS      1.10   1.22   1.28   1.34    1.46   1.28   1.28  0.117 
##  6 m/mu Kauai to FFS     0.549  0.671  0.732  0.794   0.916  0.671  0.724 0.104 
##  7 m/mu FFS to Kauai     0.122  0.183  0.183  0.183   0.244  0.183  0.174 0.0461
##  8 m/mu Oahu to Kauai    0.183  0.183  0.244  0.488   0.610  0.183  0.308 0.183 
##  9 m/mu Kauai to Oahu    0.305  0.427  0.488  0.488   0.610  0.488  0.465 0.0897
## 10 m/mu Molokai to Oahu  0.183  0.244  0.244  0.244   0.305  0.244  0.236 0.0499
## 11 m/mu Oahu to Molokai  0.366  0.427  0.427  0.488   0.549  0.427  0.454 0.0632
## 12 m/mu Maui to Molokai  1.04   1.22   1.28   1.40    1.65   1.28   1.30  0.149 
## 13 m/mu Molokai to Maui  0.183  0.183  0.244  0.244   0.244  0.244  0.217 0.0439
## 14 m/mu Big Is. to Maui  0.732  0.916  1.10   1.16    1.22   1.16   1.04  0.173 
## 15 m/mu Maui to Big Is.  1.95   1.95   1.95   2.01    2.01   1.95   1.98  0.0454

Mmu Density Mmu Ridges

Mmu Violins
Mmu Violins

4Nem estimates

parameter_type <- "Nm"

parameters <- names(posteriors12) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
nms <- summarize_posterior(posteriors12, parameter_type = parameter_type, 
                              prior_params = c(0,1,1), n = 2^17)

#convert to long format
nms_long <- nms %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density") %>% 
                filter(Parameter != "Nm_6_1")

# interleave base and dark colors: two migration directions per island pair
migration.colors <- c(rbind(theta.colors, theta.colors.dark))            
#plot posterior densities with the prior overlaid
# (size= for lines is deprecated since ggplot2 3.4 -> linewidth=)
Nms_density <-  ggplot(nms_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
        ylim(0,0.125) + xlim(0,0.003)
  
           
#plot ridgeline view
nm_ridges <- ggplot(nms_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$4N_em$")) +
        xlim(0,0.003)
  
# violin view; Density*20 scales the precomputed widths for visibility
nm_violins <- ggplot(nms_long, aes(x = Parameter, y = x, violinwidth = Density*20, 
                    fill = Parameter)) +
        geom_violin(stat = "identity") + 
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        ylim(0,0.003)
# get stats for each summed-over-loci posterior
nms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = nms, 
                                                             parameter = .x),
                                      .id = "parameter")

\(4N_em\) estimates:

nmstats <- read_csv("./tables/stepping.stone.KO.D/4Nm_stats.csv")
## Rows: 15 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nmstats
## # A tibble: 15 × 9
##    parameter      `2.5%`   `25%`   `50%`   `75%` `97.5%`    mode    mean      sd
##    <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 4Nem Oahu to… 9.09e-1 9.66e-1 9.83e-1 9.93e-1 9.99e-1 1   e+0 9.75e-1 2.46e-2
##  2 4Nem Maro to… 3.66e-4 4.27e-4 4.58e-4 4.96e-4 5.72e-4 4.50e-4 4.64e-4 5.47e-5
##  3 4Nem P&H to … 6.03e-4 6.71e-4 7.17e-4 7.63e-4 8.54e-4 7.10e-4 7.19e-4 6.61e-5
##  4 4Nem FFS to … 1.98e-4 2.21e-4 2.37e-4 2.59e-4 2.98e-4 2.37e-4 2.41e-4 2.66e-5
##  5 4Nem Maro to… 1.35e-3 1.52e-3 1.62e-3 1.72e-3 1.94e-3 1.61e-3 1.63e-3 1.51e-4
##  6 4Nem Kauai t… 6.49e-4 7.40e-4 7.93e-4 8.47e-4 9.69e-4 7.86e-4 7.97e-4 8.23e-5
##  7 4Nem FFS to … 8.70e-4 1.04e-3 1.14e-3 1.27e-3 1.54e-3 1.12e-3 1.16e-3 1.73e-4
##  8 4Nem Oahu to… 1.49e-3 1.74e-3 1.87e-3 2.01e-3 2.27e-3 1.86e-3 1.87e-3 2.01e-4
##  9 4Nem Kauai t… 1.08e-3 1.23e-3 1.31e-3 1.40e-3 1.57e-3 1.30e-3 1.31e-3 1.26e-4
## 10 4Nem Molokai… 9.99e-4 1.17e-3 1.26e-3 1.36e-3 1.56e-3 1.24e-3 1.26e-3 1.43e-4
## 11 4Nem Oahu to… 7.10e-4 8.24e-4 8.93e-4 9.61e-4 1.11e-3 8.85e-4 8.97e-4 1.03e-4
## 12 4Nem Maui to… 1.46e-3 1.65e-3 1.75e-3 1.86e-3 2.08e-3 1.75e-3 1.76e-3 1.61e-4
## 13 4Nem Molokai… 4.35e-4 5.04e-4 5.42e-4 5.80e-4 6.56e-4 5.34e-4 5.40e-4 5.67e-5
## 14 4Nem Big Is.… 1.26e-3 1.40e-3 1.47e-3 1.56e-3 1.75e-3 1.46e-3 1.48e-3 1.26e-4
## 15 4Nem Maui to… 1.17e-3 1.39e-3 1.52e-3 1.66e-3 2.05e-3 1.52e-3 1.54e-3 2.22e-4

4Nem Density 4Nem Ridges

4Nem Violins

4NeM Estimate for Oahu - Kure

This parameter is orders of magnitude larger than the others, and effectively can’t be summarized under the same prior limits. So this code is just rejigged to focus on the Oahu-Kure parameter

parameter_type <- "Nm"

parameters <- names(posteriors12) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
nms61 <- summarize_posterior(posteriors12, parameter_type = parameter_type, 
                              prior_params = c(1,1,1000), n = 2^14)

#convert to long format
nms_61long <- nms61 %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density") %>% 
                filter(Parameter == "Nm_6_1")

# interleave base and dark colors: two migration directions per island pair
migration.colors <- c(rbind(theta.colors, theta.colors.dark))            
#plot the Oahu-Kure 4Nem posterior with the prior overlaid
# (size= for lines is deprecated since ggplot2 3.4 -> linewidth=)
Nms_61density <-  ggplot(nms_61long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
        ylim(0,0.125) + xlim(0,100)
  
           
#plot ridgeline view (single ridge: Nm_6_1 only)
nm_61ridges <- ggplot(nms_61long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$4N_em$")) +
        xlim(0,50)
  
# violin view; Density*20 scales the precomputed widths for visibility
nm_61violins <- ggplot(nms_61long, aes(x = Parameter, y = x, violinwidth = Density*20, 
                    fill = Parameter)) +
        geom_violin(stat = "identity") + 
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        ylim(0,50)
# get stats for the single summed-over-loci posterior
nms_61stats <-posterior_stats(df = nms61, parameter = "Nm_6_1")
nm61stats <- read.csv("./tables/stepping.stone.KO.D/4Nm_Oahu_Kure.csv")
nm61stats
##       X          x
## 1  2.5% 34.6597693
## 2   25% 34.7207471
## 3   50% 34.7207471
## 4   75% 34.7207471
## 5 97.5% 38.1355063
## 6  mode 34.7207471
## 7  mean 34.7966844
## 8    sd  0.8936556

Divergence Estimate Oahu-Kure

This is the divergence

# Build the density grid for the divergence-time parameter (D_6_1).
# NOTE(review): `getx` is defined elsewhere in the document; its first column
# is assumed to span the divergence prior — confirm.
dens <- tibble(.rows = 2^15, x = density(getx[,1],  n=2^15, 
                                      from = 0, 
                                      to = 0.0001,
                                      bw = "nrd0")$x)
# NOTE(review): the x grid above runs to 1e-4 while the per-locus densities
# below are evaluated to 1e-3 — the two grids do not match; confirm intended.

# one kernel-density column per locus (columns 4:112 of the widened table)
dens2 <- posteriors12 %>%
          select(starts_with(c("Steps","Locus","rep",
                paste0(parameter_type,"_")))) %>%
          pivot_wider(names_from = "Locus", values_from =
                      starts_with(paste0(parameter_type,"_")),
                      names_sep = "-") %>%
          select(4:112) %>%
          map_dfc(function(x) density(x, n = 2^15, from = 0, to = 0.001, bw = "nrd0")$y)

dens2<-bind_cols(dens,dens2)

# FIX: `dens3` was used below without ever being created, so this chunk
# failed; initialize it from the assembled density table.
dens3 <- dens2

# exponential prior with mean 0.001, standardized, plus its log
dens3$prior <- dexp(dens3$x, rate = 1/0.001,log = F)
dens3$prior <- dens3$prior/sum(dens3$prior)
dens3$logPrior <- log(dens3$prior)


dens4 <- dens3 %>%
        #remove the prior, standardize
        mutate(across(starts_with(parameter_type),~ remove_prior(densityd = .x,
                                                                 prior = dens$prior,threshold = 1e-10) ))
# FIX: prefix the 109 per-locus density columns (positions 2:110) with the
# divergence label; the original indexed 1:109, which also renamed the `x`
# grid column and left the last locus column without the prefix.
names(dens4)[2:110] <- paste0("D_6_1-",names(dens4)[2:110])

dens5 <- sum_over_loci(df = dens4, parameter = "D_6_1")


#convert to long format
# FIX: dens5 carries the summed divergence column "D_6_1" (plus x and prior);
# the original selected the Nm_* names held in `parameters`, which do not
# exist in dens5, and then filtered for a "D_6_1" level that pivoting Nm_*
# columns could never produce.
D_61long <- dens5 %>% select(all_of(c("x","prior","D_6_1"))) %>% 
                pivot_longer("D_6_1", 
                            names_to = "Parameter",
                            values_to = "Density")

# interleave base and dark colors: two migration directions per island pair
migration.colors <- c(rbind(theta.colors, theta.colors.dark))            
#plot the divergence posterior with the prior overlaid
# (size= for lines is deprecated since ggplot2 3.4 -> linewidth=)
D_61density <-  ggplot(D_61long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        #scale_x_log10(limits = c(5e-5,1e-2)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
        ylim(0,0.02) + xlim(0,0.001)
  
           

# get stats for the summed-over-loci divergence posterior
D_61stats <-posterior_stats(df = dens5, parameter = "D_6_1")
Divergence Density
Divergence Density
D61stats <- read.csv("./tables/stepping.stone.KO.D/D_6_1_stats.csv")
D61stats
##       X            x
## 1  2.5% 5.249184e-04
## 2   25% 5.722221e-04
## 3   50% 5.996887e-04
## 4   75% 6.271554e-04
## 5 97.5% 6.851405e-04
## 6  mode 5.951109e-04
## 7  mean 6.006982e-04
## 8    sd 4.126898e-05

Divergence time in coalescent units, following along on page 25 of Beerli et al. (2019).

# tau = D / (theta_Oahu + theta_Kure), using the posterior modes from the
# tables above (0.00767 = theta Oahu, 0.00569 = theta Kure, 5.951109e-04 =
# mode of D_6_1)
tau <- 5.951109e-04 / (0.00767+0.00569)
tau
## [1] 0.04454423
# scale by the mutation rate — NOTE(review): assumes mu = 1.2e-8 per
# generation; confirm the source of this rate
t <- tau / 1.2e-8

Stepping-Stone KO.d model

This is the same model, without ongoing gene flow between Kure and Oahu

Read in the posteriors and make a labeling key

#
# location of the migrate-n output for this run
Dir <- "/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Ptuberculosa/migrate/run4.7"
      
winningModel <- "stepping.stone.KOd"

posteriors <- list()

#read in the posterior for all five replicates; this may take 5 minutes
for(r in 1:5){
  print(paste("Loading Rep",r))
  posteriors[[r]] <- read.table(file.path(Dir,paste0("rep",r),winningModel,
                                      "bayesallfile"), header=T) 
}

# keep only the step counter and the parameter columns
posteriors <- lapply(posteriors, select, starts_with(c("Locus","Steps","Theta_", "M_", 
                                                          "Nm_","D_","d_")))



#combine into one, and then keep only the first 2 reps (based on diagnostics below)
# NOTE(review): despite the comment above, all five replicates are bound
# here — confirm whether filtering to reps 1-2 was intended.
posteriors12 <- posteriors %>% bind_rows(.id = "rep") %>% 
                                 migrants.per.gen()  %>% mutate(rep = as.integer(rep))




#posteriors12_long <- posteriors12 %>% pivot_longer(starts_with(c("Theta_", "M_", 
#                                                           "Nm_","D_","d_")),
#                                                    names_to = "parameter",
#                                                    values_to = "value") 


# read in a population key to translate numbers into names in figures
popkey <- read_csv("popkey.csv")
names(popkey$Pop) <- popkey$Index
#make a new labeling key
# NOTE(review): `posteriors15` is not created in this section (the combined
# object above is `posteriors12`) — confirm it is defined earlier in the
# document, or that posteriors12 was intended here.
posterior_parameters <- names(posteriors15) %>% str_subset("Theta|M_|Nm_|D_|d_")

# turn raw codes like "M_2_3" into readable labels like "m/mu P&H to Maro"
parameter_key <- posterior_parameters %>% 
                                  str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%
                                  str_replace("M_", "m/mu " ) %>% 
                                  str_replace("Theta_", "theta ") %>% 
                                  str_replace_all(popkey$Pop) %>% 
                                  str_replace("Nm_", "4Nem ") 
                                  

names(parameter_key) <- posterior_parameters

Check Convergence

# posteriors.grouped <- posteriors %>% 
#                       select(all_of(c("Locus",
#                                         "Steps",
#                                         posterior_parameters))) %>% 
#                       group_by(rep)

# posterior.reps <- group_split(posterior.grouped)
# posterior.key <- group_keys(posterior.grouped)

#thin the chains, keeping every 500th sampled step
posteriors2 <- lapply(posteriors,filter,Steps%%500==0)


# select parameters only
posteriors2 <- lapply(posteriors2, select, starts_with(c("Theta_", "M_", 
                                                          "Nm_","D_","d_")))

                        
# build a coda mcmc.list (one chain per replicate) and render the standard
# ggmcmc diagnostic report, four parameters per page
posteriors.mcmc <- mcmc.list(lapply(posteriors2,mcmc,thin=500))
posterior.ggs <- ggs(posteriors.mcmc, burnin = F)

ggmcmc(posterior.ggs, param_page = 4)







#if I wanted to pull out parameters by locus, but I don't - Peter calculates Effective Sizes on the whole posterior
# NOTE(review): `posteriors15` is not created in this section (the combined
# object here is `posteriors12`) — confirm it is defined earlier.
posteriors3 <- posteriors15 %>% select(starts_with(c("Replicate","Steps","Locus","Theta_", "M_", 
                                                          "Nm_","D_","d_"))) %>% mutate(Locus_name = Locus)

# FIX: the original select() below was missing its closing parenthesis,
# which made this chunk a syntax error.
# NOTE(review): this result is immediately overwritten by the pivot_wider()
# call below, which starts again from `posteriors3`.
posteriors3_Nm <- posteriors3 %>% select(starts_with(c("Replicate","Steps","Locus", 
                                                          "Nm_")))

# widen so every parameter x locus combination becomes its own column
posteriors3_Nm <- posteriors3 %>% pivot_wider(id_cols = c("Steps", "Replicate","Locus"), 
                                                            names_from = "Locus_name",
                                               values_from =starts_with(c("Theta_", "M_", 
                                                          "Nm_","D_","d_")), names_sep = "-L")

posteriors3_Nm.mcmc <- mcmc(posteriors3_Nm)

effectiveSize(posteriors3_Nm.mcmc)
Effective Sizes

Effective sizes are good

effectiveSize(posteriors2[[1]]) (first replicate)

 Theta_1   Theta_2   Theta_3   Theta_4   Theta_5   Theta_6   Theta_7   Theta_8   Theta_9     M_3_2     M_2_3     M_4_3     M_3_4 
140194.20  99698.77 114330.54  65325.45  59135.13  73193.97 109083.37  70626.00 107475.10  50591.32 172911.83  90997.00 269569.09 
    M_5_4     M_4_5     M_6_5     M_5_6     M_7_6     M_6_7     M_8_7     M_7_8     M_9_8     M_8_9     D_6_1 
327963.24 297808.14 273528.90 337943.73 270026.70 354645.35 350985.75 653891.00 653891.00 653891.00 170758.40 

Gelman-Rubin Diagnostic

Gelman-Rubin diagnostic across 5 replicates is a bit high for some M_ parameters, but all below 1.2.


gelman.diag(posteriors.mcmc, autoburnin = F)

Potential scale reduction factors:

        Point est. Upper C.I.
Theta_1       1.00       1.00
Theta_2       1.00       1.01
Theta_3       1.01       1.02
Theta_4       1.01       1.01
Theta_5       1.01       1.03
Theta_6       1.00       1.00
Theta_7       1.00       1.00
Theta_8       1.00       1.01
Theta_9       1.01       1.01
M_3_2         1.14       1.17
M_2_3         1.13       1.13
M_4_3         1.12       1.12
M_3_4         1.07       1.07
M_5_4         1.15       1.15
M_4_5         1.00       1.00
M_6_5         1.01       1.01
M_5_6         1.12       1.12
M_7_6         1.06       1.06
M_6_7         1.10       1.10
M_8_7         1.23       1.24
M_7_8         1.10       1.10
M_9_8         1.12       1.12
M_8_9         1.21       1.25
D_6_1         1.00       1.00

Multivariate psrf

1.01

gelman.diag(posteriors.mcmc, transform = T,autoburnin=F)
Potential scale reduction factors:

Potential scale reduction factors:

        Point est. Upper C.I.
Theta_1       1.00       1.00
Theta_2       1.00       1.00
Theta_3       1.01       1.02
Theta_4       1.00       1.01
Theta_5       1.01       1.02
Theta_6       1.00       1.00
Theta_7       1.00       1.00
Theta_8       1.00       1.00
Theta_9       1.00       1.00
M_3_2         1.01       1.01
M_2_3         1.00       1.01
M_4_3         1.01       1.02
M_3_4         1.00       1.01
M_5_4         1.00       1.01
M_4_5         1.00       1.01
M_6_5         1.00       1.01
M_5_6         1.00       1.01
M_7_6         1.00       1.00
M_6_7         1.01       1.01
M_8_7         1.01       1.02
M_7_8         1.01       1.01
M_9_8         1.01       1.02
M_8_9         1.01       1.02
D_6_1         1.00       1.00

Multivariate psrf

1.02

Theta Estimates

And here I run the code for the \(\theta\) estimates

parameter_type <- "Theta"

parameters <- names(posteriors12) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
thetas <- summarize_posterior(posteriors12, parameter_type = parameter_type, 
                              prior_params = c(0,0.001,0.1))

#convert to long format
thetas_long <- thetas %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")

theta.colors <- rev(brewer.pal(9,"Spectral"))
theta.colors.light <- lighten(theta.colors)
theta.colors.dark <- darken(theta.colors)

#plot the posterior density for each theta, with the prior overlaid
thetas_density <-  ggplot(thetas_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth= 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = theta.colors, 
                             labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted") +
        ylim(0,0.175) + xlim(0,0.01)

# ridgeline view of the same posteriors
thetas_ridges <- ggplot(thetas_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity", scale = 5) + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$\\Theta$")) + xlim(0,0.01)

# violin view; Density*10 scales the precomputed widths for visibility
thetas_violins <- ggplot(thetas_long, aes(x = Parameter, y = x, 
                                          violinwidth = Density*10, 
                                          fill = Parameter)) +
        geom_violin(stat = "identity") + 
        scale_fill_discrete(type = theta.colors, labels = parameter_key) +
        scale_x_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(y = TeX("$\\Theta$")) + ylim(0,0.01)

#ggsave(filename = "thetas_density.jpg",plot=thetas_density, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "thetas_ridges.jpg",plot=thetas_ridges, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "thetas_violins.jpg",plot=thetas_violins, path = "./figures/stepping.stone.KOd")

# get stats for each summed-over-loci posterior;
# .id carries the readable label into the "parameter" column
theta_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = thetas, 
                                                             parameter = .x),
                                      .id = "parameter")

#write_csv(theta_stats,"./tables/stepping.stone.KOd/theta_stats.csv")

Theta estimates:

thetastats <- read_csv("./tables/stepping.stone.KOd/theta_stats.csv")
## Rows: 9 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
thetastats
## # A tibble: 9 × 9
##   parameter      `2.5%`   `25%`   `50%`   `75%` `97.5%`    mode    mean       sd
##   <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
## 1 theta Kure    0.00393 0.00418 0.00425 0.00435 0.00520 0.00423 0.00429  2.42e-4
## 2 theta P&H     0.00132 0.00146 0.00152 0.00159 0.00173 0.00150 0.00153  1.04e-4
## 3 theta Maro    0.00667 0.00670 0.00671 0.00672 0.00675 0.00671 0.00671  2.10e-5
## 4 theta FFS     0.00778 0.00780 0.00782 0.00783 0.00786 0.00782 0.00781  6.22e-5
## 5 theta Kauai   0.00773 0.00778 0.00780 0.00783 0.00788 0.00779 0.00780  3.97e-5
## 6 theta Oahu    0.00740 0.00745 0.00747 0.00749 0.00753 0.00748 0.00747  3.33e-5
## 7 theta Molokai 0.00691 0.00697 0.00701 0.00707 0.00716 0.00699 0.00702  7.19e-5
## 8 theta Maui    0.00726 0.00731 0.00734 0.00736 0.00740 0.00734 0.00734  4.51e-5
## 9 theta Big Is. 0.00498 0.00513 0.00536 0.00545 0.00623 0.00541 0.00540  3.62e-4

Theta Density Theta Ridges

Theta Violins
Theta Violins

m/mu estimates

parameter_type <- "M"

parameters <- names(posteriors12) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
Ms <- summarize_posterior(posteriors12, parameter_type = parameter_type, 
                              prior_params = c(0,1,1000), n = 2^14)

#convert to long format
Ms_long <- Ms %>% select(all_of(c("x","prior",parameters))) %>% 
                pivot_longer(starts_with(c(paste0(parameter_type,"_"))), 
                            names_to = "Parameter",
                            values_to = "Density")


# interleave base and dark colors: two migration directions per island pair
migration.colors <- c(rbind(theta.colors, theta.colors.dark))            
#plot posterior densities with the prior overlaid
# (size= for lines is deprecated since ggplot2 3.4 -> linewidth=)
M_density <-  ggplot(Ms_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        geom_line(linewidth = 1) + 
        scale_x_log10(limits = c(1e-2,100)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        geom_line(aes(x = x, y=prior), color = "grey",
                  linetype="dotted")
  
           
#plot ridgeline view
M_ridges <- ggplot(Ms_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_x_log10(limits = c(1e-2,100)) +
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$\\frac{m}{\\mu}$"))

# violin view using the precomputed densities as violin widths
M_violins <- ggplot(Ms_long, aes(x = Parameter, y = x, violinwidth = Density, 
                    fill = Parameter)) +
        geom_violin(stat = "identity") +
        scale_y_log10(limits = c(1e-2,100)) +
        scale_fill_discrete(type = migration.colors, labels = parameter_key) 
# get stats for each summed-over-loci posterior;
# .id carries the readable label into the "parameter" column
Ms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = Ms, 
                                                             parameter = .x),
                                      .id = "parameter")

#ggsave(filename = "M_density.jpg",plot=M_density, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "M_ridges.jpg",plot=M_ridges, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "M_violins.jpg",plot=M_violins, path = "./figures/stepping.stone.KOd")
#write_csv(Ms_stats,"./tables/stepping.stone.KOd/Mmu_stats.csv")

M/mu estimates:

mmustats <- read_csv("./tables/stepping.stone.KOd/Mmu_stats.csv")
## Rows: 14 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mmustats
## # A tibble: 14 × 9
##    parameter            `2.5%` `25%` `50%` `75%` `97.5%`  mode  mean     sd
##    <chr>                 <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>  <dbl>
##  1 m/mu Maro to P&H      1.28  1.34  1.40  1.40    1.46  1.40  1.37  0.0663
##  2 m/mu P&H to Maro      0.916 1.04  1.04  1.10    1.28  1.04  1.07  0.0989
##  3 m/mu FFS to Maro      0.244 0.244 0.244 0.244   0.305 0.244 0.249 0.0434
##  4 m/mu Maro to FFS      1.53  2.01  2.08  2.08    2.14  2.08  1.96  0.251 
##  5 m/mu Kauai to FFS     0.183 0.244 0.244 0.244   0.305 0.244 0.253 0.0596
##  6 m/mu FFS to Kauai     0.183 0.244 0.244 0.244   0.305 0.244 0.244 0.0532
##  7 m/mu Oahu to Kauai    0.122 0.122 0.183 0.244   0.305 0.183 0.182 0.0634
##  8 m/mu Kauai to Oahu    0.183 0.183 0.183 0.244   0.305 0.183 0.212 0.0582
##  9 m/mu Molokai to Oahu  0.122 0.183 0.183 0.183   0.244 0.183 0.186 0.0449
## 10 m/mu Oahu to Molokai  0.305 0.366 0.366 0.366   0.427 0.366 0.364 0.0540
## 11 m/mu Maui to Molokai  1.04  1.16  1.22  1.28    1.34  1.16  1.20  0.110 
## 12 m/mu Molokai to Maui  0.183 0.183 0.244 0.244   0.244 0.244 0.222 0.0435
## 13 m/mu Big Is. to Maui  1.04  1.10  1.16  1.16    1.16  1.16  1.13  0.0548
## 14 m/mu Maui to Big Is.  0.916 0.977 0.977 1.04    1.04  0.977 0.992 0.0461

Mmu Density Mmu Ridges

Mmu Violins
Mmu Violins

4Nem estimates

parameter_type <- "Nm"

parameters <- names(posteriors12) %>%
                        str_subset(parameter_type)

parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]

#summarize the posterior for theta
nms <- summarize_posterior(posteriors12, parameter_type = parameter_type, 
                              prior_params = c(0,1,1), n = 2^17)

#convert to long format
# Reshape the summed posterior to long format: one row per (x, parameter)
# pair, keeping the shared x grid and the prior, and excluding the
# "Nm_6_1" column (not shown in the migration plots).
nm_prefix <- paste0(parameter_type, "_")
nms_long <- nms %>%
    select(all_of(c("x", "prior", parameters))) %>%
    pivot_longer(cols = starts_with(nm_prefix),
                 names_to = "Parameter",
                 values_to = "Density") %>%
    filter(Parameter != "Nm_6_1")

# Interleave the light/dark theta palettes so paired migration directions
# share a hue family.
migration.colors <- c(rbind(theta.colors, theta.colors.dark))            
#plot: posterior density for every migration parameter, with the prior
# overlaid as a dotted grey line.
Nm_density <- ggplot(nms_long, aes(x = x, y = Density, 
                    color = Parameter)) +
        # `linewidth` replaces the line-`size` aesthetic deprecated in
        # ggplot2 3.4.0; rendering is identical.
        geom_line(linewidth = 1) + 
        #scale_x_log10(limits = c(5e-4,1e-2)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        geom_line(aes(x = x, y = prior), color = "grey",
                  linetype = "dotted") +
        ylim(0, 0.125) + xlim(0, 0.003)
  
           
#plot
Nm_ridges <- ggplot(nms_long, aes(x = x, y = Parameter, height = Density, 
                    fill = Parameter)) +
        geom_density_ridges(stat = "identity") + 
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
        labs(x = TeX("$4N_em$")) +
        xlim(0,0.003)
  
# Violin view: violin widths are the posterior densities themselves
# (scaled by 20 for visibility), drawn via stat = "identity".
Nm_violins <- ggplot(nms_long,
                     aes(x = Parameter, y = x,
                         violinwidth = Density * 20, fill = Parameter)) +
        geom_violin(stat = "identity") +
        scale_fill_discrete(type = migration.colors, labels = parameter_key) +
        lims(y = c(0, 0.003))
# Summary statistics (quantiles, mode, mean, sd) for each summed-over-loci
# posterior: one row per migration parameter, labelled by parameter_key2's
# names via .id.
Nm_stats <- map_dfr(parameter_key2,
                    function(p) posterior_stats(df = nms, parameter = p),
                    .id = "parameter")

#ggsave(filename = "Nm_density.jpg",plot=Nm_density, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "Nm_ridges.jpg",plot=Nm_ridges, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "Nm_violins.jpg",plot=Nm_violins, path = "./figures/stepping.stone.KOd")
#write_csv(Nm_stats,"./tables/stepping.stone.KOd/Nm_stats.csv")

\(4N_em\) estimates:

nmstats <- read_csv("./tables/stepping.stone.KOd/Nm_stats.csv")
## Rows: 14 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nmstats
## # A tibble: 14 × 9
##    parameter      `2.5%`   `25%`   `50%`   `75%` `97.5%`    mode    mean      sd
##    <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 4Nem Maro to… 4.73e-4 5.57e-4 6.10e-4 6.64e-4 7.86e-4 6.03e-4 6.14e-4 8.12e-5
##  2 4Nem P&H to … 5.42e-4 6.18e-4 6.64e-4 7.02e-4 7.93e-4 6.56e-4 6.62e-4 6.50e-5
##  3 4Nem FFS to … 2.06e-4 2.37e-4 2.52e-4 2.67e-4 3.05e-4 2.44e-4 2.51e-4 2.71e-5
##  4 4Nem Maro to… 1.17e-3 1.32e-3 1.40e-3 1.50e-3 1.71e-3 1.39e-3 1.41e-3 1.39e-4
##  5 4Nem Kauai t… 5.19e-4 5.95e-4 6.33e-4 6.79e-4 7.71e-4 6.26e-4 6.36e-4 6.46e-5
##  6 4Nem FFS to … 1.21e-3 1.41e-3 1.53e-3 1.66e-3 1.95e-3 1.48e-3 1.54e-3 1.87e-4
##  7 4Nem Oahu to… 1.10e-3 1.29e-3 1.40e-3 1.50e-3 1.77e-3 1.41e-3 1.40e-3 1.71e-4
##  8 4Nem Kauai t… 9.23e-4 1.06e-3 1.14e-3 1.24e-3 1.43e-3 1.13e-3 1.15e-3 1.28e-4
##  9 4Nem Molokai… 8.54e-4 9.77e-4 1.05e-3 1.13e-3 1.30e-3 1.04e-3 1.06e-3 1.17e-4
## 10 4Nem Oahu to… 6.94e-4 7.86e-4 8.39e-4 9.00e-4 1.03e-3 8.32e-4 8.47e-4 8.83e-5
## 11 4Nem Maui to… 1.10e-3 1.27e-3 1.36e-3 1.46e-3 1.66e-3 1.36e-3 1.36e-3 1.46e-4
## 12 4Nem Molokai… 4.27e-4 4.81e-4 5.11e-4 5.42e-4 6.10e-4 5.04e-4 5.11e-4 4.87e-5
## 13 4Nem Big Is.… 1.11e-3 1.24e-3 1.31e-3 1.39e-3 1.56e-3 1.30e-3 1.32e-3 1.15e-4
## 14 4Nem Maui to… 1.25e-3 1.76e-3 2.11e-3 2.37e-3 2.80e-3 2.23e-3 2.07e-3 4.23e-4

4Nem Density 4Nem Ridges

4Nem Violins

#### Divergence Estimate Oahu-Kure

This is the posterior distribution of the divergence time between Oahu and Kure (parameter D_6_1).

# x grid for the divergence-time posterior, taken from a kernel density of
# the first column of `getx` (posterior draws defined earlier in the doc).
# NOTE(review): this grid runs 0..1e-4, but the per-locus densities in
# dens2 below are evaluated on 0..1e-3; bind_cols() pairs them positionally,
# so confirm the two grids are intentionally different.
dens <- tibble(.rows = 2^15, x = density(getx[,1],  n=2^15, 
                                      from = 0, 
                                      to = 0.0001,
                                      bw = "nrd0")$x)

# Per-locus kernel densities for the divergence calculation.
# NOTE(review): parameter_type is still "Nm" here (set in the 4Nem section),
# so this selects the Nm_* columns even though the result is relabelled as
# D_6_1 below -- confirm that is intended.
dens2 <- posteriors12 %>%
          select(starts_with(c("Steps","Locus","rep",
                paste0(parameter_type,"_")))) %>%
          # spread to one column per (parameter, locus) combination
          pivot_wider(names_from = "Locus", values_from =
                      starts_with(paste0(parameter_type,"_")),
                      names_sep = "-") %>%
          # hard-coded positions: assumes 109 parameter-locus columns after
          # 3 bookkeeping columns (Steps/rep/...) -- TODO confirm
          select(4:112) %>%
          # kernel density of each column on a common 2^15-point 0..0.001 grid
          map_dfc(function(x) density(x, n = 2^15, from = 0, to = 0.001, bw = "nrd0")$y)

# Attach the shared x grid (from `dens`) to the per-locus densities.
dens2 <- bind_cols(dens, dens2)

# Exponential prior with mean 0.001, normalized to sum to 1 over the grid.
# Use FALSE rather than the reassignable shorthand F.
dens2$prior <- dexp(dens2$x, rate = 1/0.001, log = FALSE)
dens2$prior <- dens2$prior/sum(dens2$prior)
dens2$logPrior <- log(dens2$prior)


# Divide out the prior from each per-locus density (project helper
# remove_prior(); threshold guards against division by ~0).
dens4 <- dens2 %>%
        #remove the prior, standardize
        mutate(across(starts_with(parameter_type),~ remove_prior(densityd = .x,
                                                                 prior = dens2$prior,threshold = 1e-10) ))
# Relabel the first 109 columns as loci of the divergence parameter D_6_1.
# NOTE(review): column 1 of dens4 is the x grid added by bind_cols() above,
# so this also renames it to "D_6_1-x" -- confirm sum_over_loci() tolerates
# (or relies on) that.
names(dens4)[1:109] <- paste0("D_6_1-",names(dens4)[1:109])

# Combine the per-locus posteriors into a single posterior for D_6_1 and
# re-attach the x grid.
dens5 <- sum_over_loci(df = dens4, parameter = "D_6_1") 
dens5 <- bind_cols(dens5, x=dens$x)

#convert to long format
D_61long <- dens5 %>% 
                pivot_longer(starts_with(c(paste0("D","_"))), 
                            names_to = "Parameter",
                            values_to = "Density") %>% 
                filter(Parameter == "D_6_1")


D_61long <- bind_cols(D_61long,x=dens$x)

#migration.colors <- c(rbind(theta.colors, theta.colors.dark))            
#plot
# Posterior density of the Oahu-Kure divergence parameter D_6_1.
D_61density <-  ggplot(D_61long, aes(x = x, y = Density, 
                    color = Parameter)) +
        # `linewidth` replaces the line-`size` aesthetic deprecated in
        # ggplot2 3.4.0; rendering is identical.
        geom_line(linewidth = 1) + 
        #scale_x_log10(limits = c(5e-5,1e-2)) + 
        scale_color_discrete(type = migration.colors, labels = parameter_key) +
        #geom_line(aes(x = x, y=prior), color = "grey",
        #          linetype="dotted") +
        ylim(0, 0.0002) + xlim(0, 0.0002)
  
           

# Summary statistics (quantiles, mode, mean, sd) for the summed-over-loci
# divergence posterior D_6_1.
D_61stats <- posterior_stats(df = dens5, parameter = "D_6_1")

#ggsave(filename = "D_6_1.jpg",plot=D_61density, path = "./figures/stepping.stone.KOd")
#write.csv(D_61stats ,"./tables/stepping.stone.KOd/D_6_1.csv",quote=F)
Divergence Density
# Read back the saved divergence summary for display. Base read.csv() is
# used (not read_csv), so the stat labels land in a column named X, as
# shown in the printed output below.
D61stats <- read.csv("./tables/stepping.stone.KOd/D_6_1.csv")
D61stats
##       X            x
## 1  2.5% 5.306864e-05
## 2   25% 6.368908e-05
## 3   50% 6.983551e-05
## 4   75% 7.629627e-05
## 5 97.5% 8.960540e-05
## 6  mode 6.932279e-05
## 7  mean 7.021043e-05
## 8    sd 9.304214e-06

Divergence time in coalescent units, following along on page 25 of Beerli et al. (2019):

t <- tau / 1.2e-8
Beerli, Peter, Somayeh Mashayekhi, Marjan Sadeghi, Marzieh Khodaei, and Kyle Shaw. 2019. “Population Genetic Inference With MIGRATE.” Current Protocols in Bioinformatics 68 (1): e87. https://doi.org/10.1002/cpbi.87.